Let's enjoy Python
print("Hello Abdulgafar")
Hello Abdulgafar
# Area of rectangle
length=40
breadth=6
Area=length*breadth
print('Area of Rectangle=',Area)
Area of Rectangle= 240
#len Function
len('LONDON')
6
Comparison
100==100
True
600!=600
False
p=54
type(p)
int
#int to float
h=float(p)
h
54.0
#float to int
x=12.9
type(x)
float
y=int(x)
y
12
USER INPUT
city=input("Enter any city Name = ")
Enter any city Name = London
city
'London'
a=input("Enter the value of a ")
Enter the value of a 12
b=input("Enter the value of b ")
Enter the value of b 20
int(a)+int(b)
32
#Area of a circle
pi=3.14
radius=float(input("Enter the raduis"))
print("radius=",radius)
area=pi*(radius**2)
print("The area of cicleis=",round(area,2))
Enter the raduis22 radius= 22.0 The area of cicleis= 1519.76
name=input("Enter any name is-: ")
Enter any name is-: Abdulgafar
#List
list1=[]
list2=[1,2,3,4,5,]
list3=["pen","book","chair"]
list4=[45,65,"pen","book",True ]
list4
[45, 65, 'pen', 'book', True]
Indexing
list2[-2]
4
slicing
list8=[45,67,89,54,82,90,22,43,21]
list8[0:3]
[45, 67, 89]
fruitlist=["kiwi","Guava","Grapes","pineaple","Berries","Banana","Apple","Mango"]
fruitlist
['kiwi', 'Guava', 'Grapes', 'pineaple', 'Berries', 'Banana', 'Apple', 'Mango']
fruitlist.pop()
'Mango'
fruitlist
['kiwi', 'Guava', 'Grapes', 'pineaple', 'Berries', 'Banana', 'Apple']
fruitlist[3]=("mango")
fruitlist
['kiwi', 'Guava', 'Grapes', 'mango', 'Berries', 'Banana', 'Apple']
fruitlist.insert(1,"nut")
fruitlist
['kiwi', 'nut', 'Guava', 'Grapes', 'mango', 'Berries', 'Banana', 'Apple']
fruitlist.clear()
fruitlist
['kiwi', 'nut', 'Guava', 'Grapes', 'mango', 'Berries', 'Banana', 'Apple']
fruitlist.append("Grapes")
fruitlist
['kiwi', 'nut', 'Guava', 'Grapes', 'mango', 'Berries', 'Banana', 'Apple', 'Grapes']
del fruitlist
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_10140\1934137106.py in <module> ----> 1 del fruitlist NameError: name 'fruitlist' is not defined
carlist1=["skoda","Innova","Hectar"]
carlist1
['skoda', 'Innova', 'Hectar']
carlist1.append("Thar")
carlist1
['skoda', 'Innova', 'Hectar', 'Thar']
carlist1.append(["Audi","BMW","Fortuner"])
carlist1
['skoda', 'Innova', 'Hectar', 'Thar', ['Audi', 'BMW', 'Fortuner']]
carlist2=["bolero","Santro","Tigor"]
carlist2
['bolero', 'Santro', 'Tigor']
carlist2.extend(["seltos","Bentley","Tiago","Harrier","tesla"])
carlist2
['bolero', 'Santro', 'Tigor', 'seltos', 'Bentley', 'Tiago', 'Harrier', 'tesla']
carlist2.append("Jaguar")
carlist2
['bolero', 'Santro', 'Tigor', 'seltos', 'Bentley', 'Tiago', 'Harrier', 'tesla', 'Jaguar']
carlist2[3]="Benz"
carlist2
['bolero', 'Santro', 'Tigor', 'Benz', 'Bentley', 'Tiago', 'Harrier', 'tesla', 'Jaguar']
carlist2.clear()
carlist2
[]
Tuples
t1=(1,24,54,9)
t1
(1, 24, 54, 9)
t1[2]
54
t1[0]
1
Dictionary- key-value-pair
d1={"employee":7010,"Name":"Abdul","age":34}
d1
{'employee': 7010, 'Name': 'Abdul', 'age': 34}
d1.keys()
dict_keys(['employee', 'Name', 'age'])
d1.values()
dict_values([7010, 'Abdul', 34])
d1.items()
dict_items([('employee', 7010), ('Name', 'Abdul'), ('age', 34)])
d1["employee"]
7010
d1["age"]=40
d1
{'employee': 7010, 'Name': 'Abdul', 'age': 40}
d1.update({"salary":56789})
d1
{'employee': 7010, 'Name': 'Abdul', 'age': 40, 'salary': 56789}
d1.pop("age")
40
d1.keys()
dict_keys(['employee', 'Name', 'salary'])
d1.values()
dict_values([7010, 'Abdul', 56789])
d2={1:"mercury",2:"venus",3:"earth",4:"mars"}
d2
{1: 'mercury', 2: 'venus', 3: 'earth', 4: 'mars'}
d3={'Empno':[1,2,3],'Name':['Jack','Jhon','Smith'],'Age':[34,35,38],}
d3
{'Empno': [1, 2, 3], 'Name': ['Jack', 'Jhon', 'Smith'], 'Age': [34, 35, 38]}
d3.keys()
dict_keys(['Empno', 'Name', 'Age'])
d3.values()
dict_values([[1, 2, 3], ['Jack', 'Jhon', 'Smith'], [34, 35, 38]])
d3.keys()
dict_keys(['Empno', 'Name', 'Age'])
d3.items()
dict_items([('Empno', [1, 2, 3]), ('Name', ['Jack', 'Jhon', 'Smith']), ('Age', [34, 35, 38])])
d3['Name'][2]
'Smith'
d3['Name'][2] ='Abdul'
d3
{'Empno': [1, 2, 3], 'Name': ['Jack', 'Jhon', 'Abdul'], 'Age': [34, 35, 38]}
d3.get('Name')
['Jack', 'Jhon', 'Abdul']
d3
{'Empno': [1, 2, 3], 'Name': ['Jack', 'Jhon', 'Abdul'], 'Age': [34, 35, 38]}
SET
s1={11,45,65,87,98,88,77}
s1
{11, 45, 65, 77, 87, 88, 98}
s2={11,11,56,76,76,99}
s2
{11, 56, 76, 99}
days={"Sunday","monday","tuesday","wednesday","thursday","friday"}
days
{'Sunday', 'friday', 'monday', 'thursday', 'tuesday', 'wednesday'}
WHILE-Loops
#initization
i=1
# condition
while (i<5):
print(i)
i=1+i #increment
1 2 3 4
i=1
while(i<7):
print(i)
i=i+1
1 2 3 4 5 6
k=1
while(k<4):
print(k)
k=k+1
1 2 3
j=1
while(j<=15):
print(j,end=" ")
j=j+1
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
j=0
while(j<=40):
print(j,end=" ")
j=j+5
0 5 10 15 20 25 30 35 40
p=0
while(p<=5):
print(p)
print("loop")
p=p+1
print("End Of Loop")
0 loop 1 loop 2 loop 3 loop 4 loop 5 loop End Of Loop
p=0
while(p<=5):
print(p,end=" ")
print("loop")
p=p+1
print("End Of Loop")
0 loop 1 loop 2 loop 3 loop 4 loop 5 loop End Of Loop
i=1
while(i<=10):
print("2","*",i,"=",2*i)
i=i+1
2 * 1 = 2 2 * 2 = 4 2 * 3 = 6 2 * 4 = 8 2 * 5 = 10 2 * 6 = 12 2 * 7 = 14 2 * 8 = 16 2 * 9 = 18 2 * 10 = 20
# Print the 1-10 multiplication table for a number entered by the user.
i = 1
# input() returns a string; convert it so it can be used in arithmetic.
j = int(input("Enter value for table generated "))
while i <= 10:
    # Multiply by the entered number (the product was previously fixed at 2*i).
    print(j, "*", i, "=", j * i)
    i = i + 1
Enter value for table generated 10 10 * 1 = 2 10 * 2 = 4 10 * 3 = 6 10 * 4 = 8 10 * 5 = 10 10 * 6 = 12 10 * 7 = 14 10 * 8 = 16 10 * 9 = 18 10 * 10 = 20
i=1
total=0
while(i<10):
total=total+i
print(total)
i=i+1
print("Grand total =",total)
1 3 6 10 15 21 28 36 45 Grand total = 45
For Loop
for index in [10,20,30]:
print(index)
10 20 30
for index in [55,66,77,88,99]:
print(index)
55 66 77 88 99
for i in [12,67,87,89]:
print(i)
12 67 87 89
for j in [10,30,74,90,98]:
print(j)
10 30 74 90 98
for k in (15,46,90,76):
print(k)
15 46 90 76
dict1={1:"sun",2:"monn",3:"planets"}
dict1
{1: 'sun', 2: 'monn', 3: 'planets'}
for i in dict1:
print(i)
1 2 3
for i in dict1:
print(dict1[i])
sun monn planets
for i in dict1:
print(i,"=>",dict1[i])
1 => sun 2 => monn 3 => planets
for j in [10,30,74,90,98]:
print(j)
print("loop")
print("End Of Loop")
10 loop 30 loop 74 loop 90 loop 98 loop End Of Loop
Range Function :
range(3)
range(0, 3)
list(range(5))
[0, 1, 2, 3, 4]
list(range(15))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
list(range(1,10))
[1, 2, 3, 4, 5, 6, 7, 8, 9]
list(range(2,20))
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
list(range(2,22,2))
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
list(range(5,55,5))
[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for i in range(5):
print(i)
0 1 2 3 4
for k in range(7):
print(k,end=" ")
0 1 2 3 4 5 6
for p in range(0,10,2):
print(p)
0 2 4 6 8
for p in range(0,20,2):
print(p,end=" ")
0 2 4 6 8 10 12 14 16 18
number=int(input("Enter number to be used for generating table "))
for i in range(1,11):
print(number,"*",i,"=",number*i)
Enter number to be used for generating table 10 10 * 1 = 10 10 * 2 = 20 10 * 3 = 30 10 * 4 = 40 10 * 5 = 50 10 * 6 = 60 10 * 7 = 70 10 * 8 = 80 10 * 9 = 90 10 * 10 = 100
citylist=[]
for p in range(3):
city=input("Enter any city name")
citylist.append(city)
print(citylist)
print("Complete city list= ",citylist)
Enter any city namekano ['kano'] Enter any city namelagos ['kano', 'lagos'] Enter any city nameibadan ['kano', 'lagos', 'ibadan'] Complete city list= ['kano', 'lagos', 'ibadan']
marks=90
if marks>50:
print("Good")
else:
print("Need to work hard")
Good
marks=90
if marks>50:
print("Good")
else:
pass
Good
marks=40
if marks>50:
print("Good")
else:
print("Need to work hard")
Need to work hard
num=15
if num%2==0:
print("Even number")
else:
print("odd number")
odd number
num=int(input("enter any number = "))
if num%2==0:
print("Even number")
else:
print("odd number")
enter any number = 10 Even number
# Grade a mark entered by the user.
marks = int(input("Enter any marks"))
# Each branch is only reached when the previous one failed, so the upper
# bounds are implied. Dropping them closes the gaps at exactly 90, 80 and 70,
# which previously matched no band and fell through to "Work hard".
if marks > 90:
    print("Excellent")
elif marks > 80:
    print("Very Good")
elif marks > 70:
    print("Good")
elif marks > 60:
    print("Average")
else:
    print("Work hard")
#Break and continue statement:
for i in [10,20,30,40]:
print(i)
for i in [10,20,30,40]:
if i==30:
break
print(i)
for i in ['sun','moon','earth','water']:
if i=='moon':
break
print(i)
for i in [10,20,30,40]:
if i==30:
continue
print(i)
for i in ['sun','moon','earth','venus','water']:
if i=='venus':
continue
print(i)
#List comprehension
m=[i for i in [10,20,30,40,50,60]]
m
p=[i for i in range(1,20) if i%2==0]
p
k=[i for i in range(1,20) if i%2!=0]
k
String Function
world1="COMPUTER"
world1.lower()
world2="mobile"
world2.upper()
world2.upper()
w1=world2.upper()
w1
w1.isupper()
w1.islower()
world2.islower()
world3='planeTS'
world3.swapcase()
a='Twitter is a renamed by x'
a
a.split()
b="elon musk is a ceo of x"
b
b.capitalize()
b.title()
c="MarsRover lands on Mars Planet"
c
c.startswith('MarsRover')
c.endswith('Planet')
p="ABCDEF"
p.isalpha()
m="567786544"
m.isdigit()
m.isalpha()
k="698ASDF"
k
k.isalnum()
world="Elon musk launched AI Car Tesla"
world
len(world)
world.find('Car')
world
world.find('AI',16,30)
world.count('a')
world.count('s')
world1="NASA is building hotel project in Mars"
world1
world1.replace("NASA","SPACEX")
text=["NASA is building hotel project in Mars"]
text
''.join(text)
' '.join(text)
mystring=' '
mystring.join(text)
# casefold
string1="Tesla"
string2="tesla"
if string1.casefold()==string2.casefold():
print("The two strings are matching")
else:
print("The two string are not matching")
string1="Tesla"
string2="tesla"
if string1==string2:
print("The two strings are matching")
else:
print("The two string are not matching")
Functions
help(max)
help()
def function1():
    """Print a fixed message showing that the function body ran."""
    print("AM in function1")
function1()
def myfunction():
    """Print a fixed message from inside the function."""
    print("inside myfunction")
myfunction()
def addfunction():
    """Add two fixed numbers (10 and 20) and print the sum."""
    a = 10
    b = 20
    c = a + b
    print("The sum is =", c)
addfunction()
def divisionfunction():
    """Divide 80 by 5 and print the quotient (true division, so a float)."""
    a = 80
    b = 5
    c = a / b
    print("The division is", c)
divisionfunction()
def substract():
    """Subtract 5 from 10 and print the difference."""
    a = 10
    b = 5
    c = a - b
    print("The substract=", c)
substract()
def square_function():
    """Square the fixed value 8 and print the result."""
    s = 8
    sq = s ** 2
    print("The Square of s is", sq)
square_function()
def rectarea_func():
    """Prompt for an integer length and breadth and print the rectangle area.

    Raises:
        ValueError: if either entry cannot be parsed by ``int``.
    """
    length = int(input("Enter length"))
    breadth = int(input("Enter breadth"))
    area = length * breadth
    print("The area of rectanngle =", area)
rectarea_func()
def area_circle_function():
    """Print the area of a circle of radius 77, using pi approximated as 3.14."""
    pi = 3.14
    radius = 77
    area_circle = pi * (radius ** 2)
    print("The area of circle is= ", area_circle)
area_circle_function()
def area_circle_function():
    """Prompt for a radius and print the circle area, using pi approximated as 3.14.

    This definition replaces the earlier fixed-radius function of the same name.

    Raises:
        ValueError: if the entry cannot be parsed by ``float``.
    """
    pi = 3.14
    radius = float(input("Enter the value of radius"))
    area_circle = pi * (radius ** 2)
    print("The area of circle is= ", area_circle)
area_circle_function()
def add_func(a, b):
    """Print the sum of ``a`` and ``b``.

    Args:
        a: first operand.
        b: second operand.
    """
    c = a + b
    print("sum", c)
a=5
b=10
add_func(a,b)
def multiply_func(a, b):
    """Print the product of ``a`` and ``b``.

    Args:
        a: first factor.
        b: second factor.
    """
    c = a * b
    print("Multiply=", c)
a=80
b=2
multiply_func(a,b)
def are_rectf(l, b):
    """Print the area of a rectangle.

    Args:
        l: length of the rectangle.
        b: breadth of the rectangle.
    """
    area = l * b
    print("Area of rectangle=", area)
l=49
b=9
are_rectf(l,b)
def substract_fun(l, b):
    """Print the difference ``l - b``.

    Args:
        l: value to subtract from.
        b: value to subtract.
    """
    # Use the parameter ``l``. The body previously read ``a``, which is not a
    # parameter, so the result came from whatever global ``a`` happened to hold.
    c = l - b
    print("substract =", c)
l=int(input("Enter any length"))
b=int(input("Enter any breadth"))
substract_fun(l,b)
#variable number of argument--*args
def tesfunc(*args):
    """Print the sum of any number of positional arguments (0 if none)."""
    print(sum(args))
tesfunc(1,2)
tesfunc(1,2,3)
def testminfunc(*args):
print(min(args))
testminfunc(2,4,5,6,3)
def testminfunc(*args):
print(min(args))
testminfunc(90,80,45)
def testmaxfunc(*args):
print(max(args))
testmaxfunc(2,4,5,6,3)
# global variable
age = 21
age


def testfuntion():
    """Print a local variable, then the module-level ``age``."""
    marks = 80  # local variable
    print(marks)
    # ``age`` is never assigned in this function, so the name resolves to the
    # global defined above.
    print("age = ", age)
testfuntion()
def addfunction():
    """Return the sum of the fixed values 10 and 5.

    Unlike the earlier printing version of the same name, this one returns the
    result so the caller can reuse it.
    """
    a = 10
    b = 5
    result = a + b
    return result
myvar=addfunction()
myvar**2
def multiplyfunc():
    """Multiply the fixed values 8 and 3 and print the product."""
    x = 8
    y = 3
    z = x * y
    print(z)
multiplyfunc()
def f1():
    """Print 1, 2, 3 and then leave the function early.

    Shows that a bare ``return`` ends the whole function, not just the loop.
    """
    for i in range(1, 10):
        if i == 4:
            return
        print(i)
f1()
def f2(*args):
    """Return the sum of the positional arguments (0 if none are given)."""
    return sum(args)
f2(1,2,3,4,5)
f2(2,4,9,10,24)
def my_sum(*args):
    """Return the sum of the positional arguments (0 if none are given)."""
    result = 0
    # iterating over the python args
    for x in args:
        result += x
    # Return only after the loop has consumed every argument.
    return result
print(my_sum(1,2,3,4))
def add(*numbers):
    """Return the total of all positional arguments (0 if none are given)."""
    total = 0
    for num in numbers:
        total += num
    return total
print(add(2,3))
print(add(2,3,4))
print(add(2,3,4,5))
print(add(2,3,4,5,6))
# **kwargs ----> keyword arguments
def concatenate(**kwargs):
    """Return the keyword-argument values joined together in call order.

    Args:
        **kwargs: keyword arguments whose values are strings.
    """
    result = ""
    # iterating over the python kwargs dictionary
    for arg in kwargs.values():
        result += arg
    return result
print(concatenate(a="Real ",b="python ",c="great ",e="enjoy"))
def concatenate(**kwargs):
    """Return the keyword-argument names joined together in call order.

    Iterating a dict directly yields its keys, so this variant joins the
    argument names rather than their values.
    """
    result = ""
    # iterating over the python kwargs dictionary
    for arg in kwargs:  # keys
        result += arg
    return result
print(concatenate(a="Real ",b="python ",c="great ",e="enjoy"))
def addfunc():
    """Prompt for two integers and return their sum.

    Raises:
        ValueError: if either entry cannot be parsed by ``int``.
    """
    a = int(input("Enter a"))
    b = int(input("Enter b"))
    c = a + b
    return c
def minusfunc():
    """Return the difference of the fixed values 80 and 7."""
    x = 80
    y = 7
    z = x - y
    return z
def calculator_func():
    """Run ``addfunc`` and ``minusfunc`` and print each result with a label."""
    addresult = addfunc()
    print("Addition", addresult)
    minusresult = minusfunc()
    print("minus", minusresult)
calculator_func()
# Lambda Functions
#lambda variable: expression
t= lambda x:x+2
t(5)
t(80)
j=lambda p:p+10
j(50)
j(160)
h=lambda a:a**2
h(8)
h(12)
h(9)
t=lambda a,b:a*b
t(10,30)
t(90,10)
list1=[10,20,30,40]
list(map(lambda x:x+2,list1))
[12, 22, 32, 42]
list1=[10,20,30,40,50,60,70,80]
list(filter(lambda x:x>40,list1))
[50, 60, 70, 80]
def myf():
    """Print a greeting."""
    print('hello ')
i=0
while(i<5):
print(i)
i=i+1
print('hello')
0 hello 1 hello 2 hello 3 hello 4 hello
def testf(k):
result=30/k
print(result)
testf(2)
15.0
Enter value0 Handling run time error division by zero
k=int(input("Enter value"))
testf(k)
Enter value0
--------------------------------------------------------------------------- ZeroDivisionError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_20364\3180007702.py in <module> 1 k=int(input("Enter value")) ----> 2 testf(k) ~\AppData\Local\Temp\ipykernel_20364\292434318.py in testf(k) 1 def testf(k): ----> 2 result=30/k 3 print(result) ZeroDivisionError: division by zero
# How to handle the error
try:
k=int(input("Enter value"))
testf(k)
except ZeroDivisionError as err:
print('Handling run time error',err)
Enter value0 Handling run time error division by zero
try:
i=int(input('value'))
p=50/i
except ZeroDivisionError:
print('Zero division error')
except NameError:
print('check name of variable :Name error')
else:
print(p)
value8 6.25
# Demonstrates try / except / else / finally around list indexing.
a = [10, 20, 30]
try:
    print(a[0])
    print(a[1])
    print(a[2])
# Catch only the error this handler describes; a bare ``except:`` would also
# swallow unrelated failures such as KeyboardInterrupt.
except IndexError:
    print("out of range")
else:
    # Runs only when the try body raised nothing.
    print("hello")
finally:
    # Runs whether or not an exception occurred.
    print("end of the program")
10 20 30 hello end of the program
OOP - Object-Oriented Programming
class student:
    """A student record holding a roll number and a name."""

    # initialization constructor
    def __init__(self, rollno, name):
        """Store the roll number and name on the instance."""
        self.rollno = rollno
        self.name = name

    def display(self):
        """Print the roll number and the name on separate lines."""
        print("Rollno=", self.rollno)
        print("Name=", self.name)
# instance/object
stud1=student(1,'abdul')
stud1.display()
Rollno= 1 Name= abdul
stud2=student(2,'zainab')
stud2.display()
Rollno= 2 Name= zainab
stud1.name
'abdul'
stud3=student(3,'smit')
stud3.display()
Rollno= 3 Name= smit
class Employee:
    """An employee record; ``company`` is shared by every instance."""

    company = 'Google'  # class variable

    def __init__(self, empno, name):
        """Store the employee number and name on the instance."""
        self.empno = empno
        self.name = name

    def display(self):
        """Print the employee number and the name on separate lines."""
        print('Empno', self.empno)
        print('Name', self.name)
emp1=Employee(101,'Toyin')
emp1.display()
Empno 101 Name Toyin
Employee.company
'Google'
class car:
    """A simple car described by its company and colour."""

    # constructor to initialize
    def __init__(self, company, color):
        """Store the company and colour on the instance."""
        self.company = company
        self.color = color

    # function to print car company and color
    def display(self):
        """Print a one-line description: colour first, then company."""
        print('This is a', self.color, self.company)
ca=car("Ferrari","Red")
ca.display()
This is a Red Ferrari
class Car:
    """A car described by its model and colour."""

    def __init__(self, model, color):
        """Store the model and colour on the instance."""
        self.model = model
        self.color = color

    def display(self):
        """Print the model and colour on one line."""
        print(f"Model: {self.model}, Color: {self.color}")
def main():
    """Build a sample ``Car`` and print its details."""
    c = Car("Ferrari", "Red")
    c.display()


# Run the demo only when this module is executed as a script.
if __name__ == "__main__":
    main()
Model: Ferrari, Color: Red
class player:
    """A sports player with a name, an age and a sport."""

    def __init__(self, name, age, sport):
        """Print a notice and store the initial player details."""
        print('Iniatialized data for players')
        self.name = name
        self.age = age
        self.sport = sport

    def getdata(self, name, age, sport):
        """Print a notice and overwrite the stored player details."""
        print("Getting data for players")
        self.name = name
        self.age = age
        self.sport = sport

    def displaydata(self):
        """Print the stored name, age and sport on one line."""
        print("Player details :", self.name, self.age, self.sport)
p1=player("Virat",35,"Cricket")
p1.displaydata()
p1.getdata("Richard",25,"Tennis")
p1.displaydata()
Iniatialized data for players Player details : Virat 35 Cricket Getting data for players Player details : Richard 25 Tennis
import keyword
keyword.kwlist
['False', 'None', 'True', '__peg_parser__', 'and', 'as', 'assert', 'async', 'await', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield']
import sys
sys.path
['C:\\Users\\USER', 'C:\\Users\\USER\\anaconda3\\python39.zip', 'C:\\Users\\USER\\anaconda3\\DLLs', 'C:\\Users\\USER\\anaconda3\\lib', 'C:\\Users\\USER\\anaconda3', '', 'C:\\Users\\USER\\anaconda3\\lib\\site-packages', 'C:\\Users\\USER\\anaconda3\\lib\\site-packages\\win32', 'C:\\Users\\USER\\anaconda3\\lib\\site-packages\\win32\\lib', 'C:\\Users\\USER\\anaconda3\\lib\\site-packages\\Pythonwin', 'C:\\Users\\USER\\anaconda3\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\USER\\.ipython']
import os
os.listdir()
['.ipynb_checkpoints',
'.ipython',
'.jupyter',
'anaconda3',
'AppData',
'Application Data',
'Contacts',
'Cookies',
'Desktop',
'Documents',
'Downloads',
'Favorites',
'Links',
'Local Settings',
'Microsoft',
'Music',
'My Documents',
'NetHood',
'NTUSER.DAT',
'ntuser.dat.LOG1',
'ntuser.dat.LOG2',
'NTUSER.DAT{f28145ce-bc8e-11ee-8044-a7aa5cff8d87}.TM.blf',
'NTUSER.DAT{f28145ce-bc8e-11ee-8044-a7aa5cff8d87}.TMContainer00000000000000000001.regtrans-ms',
'NTUSER.DAT{f28145ce-bc8e-11ee-8044-a7aa5cff8d87}.TMContainer00000000000000000002.regtrans-ms',
'ntuser.ini',
'OneDrive',
'OneDrive - Pearl The SE',
'Pictures',
'PrintHood',
'PycharmProjects',
'Recent',
'Saved Games',
'Searches',
'SendTo',
'Start Menu',
'Templates',
'Untitled.ipynb',
'Videos',
'Welcome To Python.ipynb']
import math
math.pi
3.141592653589793
math.sqrt(64)
8.0
math.sqrt(91)
9.539392014169456
math.pow(2,2)
4.0
math.pow(9,2)
81.0
math.__dict__.keys()
dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', 'acos', 'acosh', 'asin', 'asinh', 'atan', 'atan2', 'atanh', 'ceil', 'copysign', 'cos', 'cosh', 'degrees', 'dist', 'erf', 'erfc', 'exp', 'expm1', 'fabs', 'factorial', 'floor', 'fmod', 'frexp', 'fsum', 'gamma', 'gcd', 'hypot', 'isclose', 'isfinite', 'isinf', 'isnan', 'isqrt', 'lcm', 'ldexp', 'lgamma', 'log', 'log1p', 'log10', 'log2', 'modf', 'pow', 'radians', 'remainder', 'sin', 'sinh', 'sqrt', 'tan', 'tanh', 'trunc', 'prod', 'perm', 'comb', 'nextafter', 'ulp', 'pi', 'e', 'tau', 'inf', 'nan'])
import math
radius=98
area=math.pi*(radius**2)
print(area)
30171.855845076374
import math as mt
mt.sqrt(81)
9.0
radius=98
area=mt.pi*(radius**2)
print(area)
30171.855845076374
from math import pow
pow(4,2)
16.0
#get current date
import datetime
datetime.datetime.now()
datetime.datetime(2024, 3, 17, 3, 41, 21, 119324)
d1=datetime.date.today()
d1
datetime.date(2024, 3, 17)
datetime.date(2024,2,7)
datetime.date(2024, 2, 7)
d1.year
2024
from datetime import date
# Take today's date and print its components.
td = date.today()
td
print("Year=", td.year)
# Label the month as a month (it was previously printed under "Year=").
print("Month=", td.month)
print("Days=", td.day)
Year= 2024 Year= 3 Days= 17
date.today()
datetime.date(2024, 3, 10)
from datetime import time
t1=time(6,45,23)
t1.hour
6
t1.minute
45
t1.second
23
from datetime import time
a=time()
print(a)
# time(hour,minute,second)
b=time(12,43,56)
print(b)
00:00:00 12:43:56
print("hour",b.hour)
print("minute",b.minute)
print("second",b.second)
hour 12 minute 43 second 56
import time
time.localtime()
time.struct_time(tm_year=2024, tm_mon=3, tm_mday=17, tm_hour=6, tm_min=18, tm_sec=32, tm_wday=6, tm_yday=77, tm_isdst=0)
from datetime import datetime
a=datetime(2022,8,14)
print(a)
2022-08-14 00:00:00
print("year",a.year)
print("month",a.month)
print("day",a.day)
year 2022 month 8 day 14
b=datetime(2022,8,14,12,47,50)
print(b.year)
print(b.month)
print(b.day)
print(b.hour)
print(b.minute)
print(b.second)
2022 8 14 12 47 50
# Write a line to the file, then read it back.
# ``with`` closes each handle even if the write or read raises.
with open('c:\\now\\demomarch', 'w') as dd:
    dd.write("we are enjoying python commands")
with open('c:\\now\\demomarch', 'r') as rfile:
    print(rfile.read())
we are enjoying python commands
# Append a second line to the same file, then print the whole file.
# ``with`` closes each handle even if the write or read raises.
with open('c:\\now\\demomarch', 'a') as appfile:
    appfile.write("\n Republic day is coming this week")
with open('c:\\now\\demomarch', 'r') as rfile:
    print(rfile.read())
we are enjoying python commands Republic day is coming this week
#code re-use inheritance
class Bird:
    """Base class providing ``speak``."""

    def speak(self):
        """Print the bird's speaking message."""
        print("Bird Speaking")


# child class Sparrow inherits the base class Bird
class Sparrow(Bird):
    """Subclass that adds ``chirp`` and inherits ``speak`` from ``Bird``."""

    def chirp(self):
        """Print the sparrow's chirping message."""
        print("Sparrow chirping")
#instance/object creation
sp= Sparrow()
sp.chirp()
sp.speak()
Sparrow chirping Bird Speaking
class Grandfather:
    """Top of a three-level inheritance chain; provides ``speak``."""

    def speak(self):
        """Print the grandfather's message."""
        print("Grandfather speaking")


class father(Grandfather):
    """Middle level; adds ``sleep`` and inherits ``speak``."""

    def sleep(self):
        """Print the father's message."""
        print("father sleeping")


class child(father):
    """Bottom level; adds ``eat`` and inherits ``sleep`` and ``speak``."""

    def eat(self):
        """Print the child's message."""
        print("Eating bread")
ch= child()
ch.speak()
ch.sleep()
ch.eat()
Grandfather speaking father sleeping Eating bread
# Multiple inheritance
class calculation1:
    """First base class; provides addition."""

    def summation(self, a, b):
        """Return ``a + b``."""
        return a + b


class calculation2:
    """Second base class; provides multiplication."""

    def multiplication(self, a, b):
        """Return ``a * b``."""
        return a * b


class Derived(calculation1, calculation2):
    """Inherits from both base classes and adds division."""

    def Divide(self, a, b):
        """Return ``a / b`` (true division).

        Raises:
            ZeroDivisionError: if ``b`` is 0.
        """
        return a / b
d=Derived()
print(d.summation(10,20))
print(d.multiplication(10,20))
print(d.Divide(10,20))
30 200 0.5
# method overriding
class Animal:
    """Base class with a generic ``speak``."""

    def speak(self):
        """Print the generic message."""
        print("speaking")


class dog(Animal):
    """Subclass that overrides ``speak``."""

    def speak(self):
        """Print the dog's message instead of the base-class one."""
        print("Barking")
d=dog()
d.speak()
Barking
class Bank:
    """Base bank; ``getroi`` returns its rate of interest."""

    def getroi(self):
        """Return the base rate of interest."""
        return 11


class SBI(Bank):
    """Bank subclass that overrides the rate of interest."""

    def getroi(self):
        """Return SBI's rate of interest."""
        return 7


class ICICI(Bank):
    """Bank subclass that overrides the rate of interest."""

    def getroi(self):
        """Return ICICI's rate of interest."""
        return 8
b1= Bank()
b2= SBI()
b3= ICICI()
print("Bank of Interest",b1.getroi());
print("SBI Rate of Interest",b2.getroi());
print("ICICI Rate of Interest",b3.getroi())
Bank of Interest 11 SBI Rate of Interest 7 ICICI Rate of Interest 8
#Data abstraction
class Employee:
    """Counts how many instances have been created, in a name-mangled attribute."""

    # The double leading underscore makes Python store this as
    # ``_Employee__count``, so ``emp.__count`` from outside the class raises
    # AttributeError.
    __count = 0

    def __init__(self):
        """Increment the shared instance counter."""
        Employee.__count = Employee.__count + 1

    def display(self):
        """Print the number of instances created so far."""
        print("The number of employee", Employee.__count)
emp = Employee()
try:
print(emp.__count)
except(AttributeError):
print('AttributeError:Employee object has no attribute __count')
finally:
emp.display()
AttributeError:Employee object has no attribute __count The number of employee 1
import re
pattern = "Data science"
text = "Data science is a stream and it is of AI. You can solve many complex problems using Data science tools and techs"
match = re.findall(pattern,text)
print(match)
['Data science', 'Data science']
# Find every run of digits in the text.
# Raw string: ``\d`` is an invalid escape in a normal string literal and
# triggers a warning on current Python versions.
pattern = r"\d+"
text = "There are 123 apples and 456 oranges"
matches = re.findall(pattern, text)
print(matches)
['123', '456']
# Extract the numeric price from each message.
price = ["apple cost £60", "mango cost £100"]
for msg in price:
    # Raw string so ``\d`` reaches the regex engine without an
    # invalid-escape warning.
    match = re.findall(r"\d+", msg)
    print(match)
['60'] ['100']
# re.search returns None when the pattern does not occur.
test = "India China USA"
# Search the string defined on the line above. The call previously searched
# ``text``, a leftover variable from an earlier cell.
x = re.search("France", test)
print(x)
None
msg1 = "this product is really great"
search = re.search("^this.*great$",msg1)
print(search)
<re.Match object; span=(0, 28), match='this product is really great'>
import numpy as np
np.array([1,2,3,4,5,6])
array([1, 2, 3, 4, 5, 6])
a=np.array([11,22,13,14,15])
a
array([11, 22, 13, 14, 15])
b=np.array([12.4,60.7,8.8])
b
array([12.4, 60.7, 8.8])
c=np.array([44,65,90,87])
c
array([44, 65, 90, 87])
d=np.array([22,77,98,21,24,99,54])
d
array([22, 77, 98, 21, 24, 99, 54])
e=np.array([[1,2],[3,4]])
e
array([[1, 2],
[3, 4]])
e.shape
(2, 2)
f=np.array([[11,12],[13,14]])
f
array([[11, 12],
[13, 14]])
f.shape
(2, 2)
g=np.array([[10,20],[30,40],[50,60]])
g
array([[10, 20],
[30, 40],
[50, 60]])
g.shape
(3, 2)
h=np.array([[10,20,30],[40,50,60],[70,80,90]])
h
array([[10, 20, 30],
[40, 50, 60],
[70, 80, 90]])
h.shape
(3, 3)
h.ndim
2
h.max()
90
# total number of element
h.size
9
# bytesize in memory
h.itemsize
4
a.size
5
a
array([11, 22, 13, 14, 15])
a.shape
(5,)
i=np.array([[1,2,3],[4,5,6]])
i
array([[1, 2, 3],
[4, 5, 6]])
i.shape
(2, 3)
i.reshape(3,2)
array([[1, 2],
[3, 4],
[5, 6]])
p=np.arange(9)
p
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
p.reshape(3,3)
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
k=np.arange(35)
k
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34])
k.reshape(7,5)
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19],
[20, 21, 22, 23, 24],
[25, 26, 27, 28, 29],
[30, 31, 32, 33, 34]])
e.dtype
dtype('int32')
x=np.array([[34,56,],[24,56]],dtype='float64')
x
array([[34., 56.],
[24., 56.]])
x.dtype
dtype('float64')
x.itemsize
8
type(x)
numpy.ndarray
x
array([[34., 56.],
[24., 56.]])
x.ravel()
array([34., 56., 24., 56.])
np.zeros([3,4])
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
np.ones([3,4])
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
np.arange(5,50)
array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])
np.arange(5,50,5)
array([ 5, 10, 15, 20, 25, 30, 35, 40, 45])
a=np.array([2,3,4])
a
array([2, 3, 4])
b=np.array([1,2,3])
b
array([1, 2, 3])
a+b
array([3, 5, 7])
a*b
array([ 2, 6, 12])
s=np.array([5,6,7])
s
array([5, 6, 7])
p=np.array([2])
p
array([2])
s*p
array([10, 12, 14])
# Broadcasting Numpy
a=np.array([20,30,40,50])
b=np.arange(4)
print(a)
print(b)
[20 30 40 50] [0 1 2 3]
c=a-b
print(c)
[20 29 38 47]
b**2
array([0, 1, 4, 9], dtype=int32)
b=np.random.random([2,3])
b
array([[0.35871876, 0.40111299, 0.8023355 ],
[0.94337907, 0.07529619, 0.43047038]])
c=np.ones([2,3])
c
array([[1., 1., 1.],
[1., 1., 1.]])
d=b+c
d
array([[1.35871876, 1.40111299, 1.8023355 ],
[1.94337907, 1.07529619, 1.43047038]])
Universal Functions
o=np.arange(4)
o
array([0, 1, 2, 3])
np.exp(o)
array([ 1. , 2.71828183, 7.3890561 , 20.08553692])
np.sqrt(o)
array([0. , 1. , 1.41421356, 1.73205081])
q=np.array([2,1,4,5])
q
array([2, 1, 4, 5])
np.add(q,o)
array([2, 2, 6, 8])
#index
q[2]
4
q
q[2:6]
array([4, 5])
a1=np.array([24,np.nan,25,90,np.nan])
a1
array([24., nan, 25., 90., nan])
np.random.rand(2,2)
array([[0.81646149, 0.70619368],
[0.15002024, 0.22030583]])
np.random.randn(2,2)
array([[ 0.1227025 , -2.01843524],
[-0.27544933, -1.35421677]])
# SERIES
import pandas as pd
pd.Series([1,2,3,4,5])
0 1 1 2 2 3 3 4 4 5 dtype: int64
a=pd.Series(data=[11,22,44,56,34,56,68],index=[1,2,3,4,5,6,7])
a
1 11 2 22 3 44 4 56 5 34 6 56 7 68 dtype: int64
b=pd.Series(data=[45,56,78,65,90],index=[10,11,12,13,14])
b
10 45 11 56 12 78 13 65 14 90 dtype: int64
import numpy as np
s=np.array([70,80,90,100])
s
array([ 70, 80, 90, 100])
pd.Series(s)
0 70 1 80 2 90 3 100 dtype: int32
dict1={"id":102,"Name":"Smith"}
dict1
{'id': 102, 'Name': 'Smith'}
k=pd.Series(dict1)
k
id 102 Name Smith dtype: object
# DATAFRAME
pd.DataFrame([1,2,3])
| 0 | |
|---|---|
| 0 | 1 |
| 1 | 2 |
| 2 | 3 |
pd.DataFrame(data=[10,20,30,40])
| 0 | |
|---|---|
| 0 | 10 |
| 1 | 20 |
| 2 | 30 |
| 3 | 40 |
e=[12,34,90,60]
L=pd.DataFrame(e)
L
| 0 | |
|---|---|
| 0 | 12 |
| 1 | 34 |
| 2 | 90 |
| 3 | 60 |
data=[['Alex',10],['Bob',12],['clarke',13]]
data
[['Alex', 10], ['Bob', 12], ['clarke', 13]]
pd.DataFrame(data)
| 0 | 1 | |
|---|---|---|
| 0 | Alex | 10 |
| 1 | Bob | 12 |
| 2 | clarke | 13 |
pd.DataFrame(data,columns=['Name','Age'])
| Name | Age | |
|---|---|---|
| 0 | Alex | 10 |
| 1 | Bob | 12 |
| 2 | clarke | 13 |
pd.DataFrame(data,columns=['Name','Age'],index=[1,2,3])
| Name | Age | |
|---|---|---|
| 1 | Alex | 10 |
| 2 | Bob | 12 |
| 3 | clarke | 13 |
pd.DataFrame(data,columns=['Name','Age'])
| Name | Age | |
|---|---|---|
| 0 | Alex | 10 |
| 1 | Bob | 12 |
| 2 | clarke | 13 |
# Cast only the numeric column. Passing dtype=int for the whole frame cannot
# convert the 'Name' strings, which triggers a FutureWarning and is slated to
# become a TypeError.
pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': int})
C:\Users\USER\AppData\Local\Temp\ipykernel_10140\2071186554.py:1: FutureWarning: Could not cast to int32, falling back to object. This behavior is deprecated. In a future version, when a dtype is passed to 'DataFrame', either all columns will be cast to that dtype, or a TypeError will be raised. pd.DataFrame(data,columns=['Name','Age'],dtype=int)
| Name | Age | |
|---|---|---|
| 0 | Alex | 10 |
| 1 | Bob | 12 |
| 2 | clarke | 13 |
dict2={'id':[1,2,3],'Name':['Abdullah','Muhammad','Faisal']}
pd.DataFrame(dict2)
| id | Name | |
|---|---|---|
| 0 | 1 | Abdullah |
| 1 | 2 | Muhammad |
| 2 | 3 | Faisal |
pd.DataFrame(dict2,columns=['id','Name'],index=[1,2,3])
| id | Name | |
|---|---|---|
| 1 | 1 | Abdullah |
| 2 | 2 | Muhammad |
| 3 | 3 | Faisal |
data = [{'a':1,'b':2},{'a':5,'b':10,'c':20}]
data
[{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data)
| a | b | c | |
|---|---|---|---|
| 0 | 1 | 2 | NaN |
| 1 | 5 | 10 | 20.0 |
pd.DataFrame(data,columns=['a','b'])
| a | b | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 5 | 10 |
pd.DataFrame(data,columns=['a','b1'])
| a | b1 | |
|---|---|---|
| 0 | 1 | NaN |
| 1 | 5 | NaN |
PANDAS VISUALIZATION
# BAR PLOT
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
print(df)
a b c d 0 0.084494 0.593562 0.701404 0.145777 1 0.179170 0.458246 0.601145 0.703916 2 0.985736 0.657061 0.547486 0.846934 3 0.456382 0.998602 0.335720 0.308651 4 0.487489 0.755290 0.409262 0.088529 5 0.006262 0.163425 0.221446 0.543407 6 0.209881 0.130309 0.007651 0.757832 7 0.748097 0.309950 0.931806 0.988432 8 0.836019 0.184422 0.325192 0.995035 9 0.329597 0.424228 0.655226 0.856977
df.plot.bar()
<AxesSubplot:>
df.plot.barh()
<AxesSubplot:>
df.plot.hist()
<AxesSubplot:ylabel='Frequency'>
df.plot.box()
<AxesSubplot:>
df.plot.scatter(x='a',y='b')
<AxesSubplot:xlabel='a', ylabel='b'>
df = pd.DataFrame({"Day":[1,2,3,4], "Visitors":[200,100,230,300],"Bounce_Rate":[20,45,60,10]})
print(df)
Day Visitors Bounce_Rate 0 1 200 20 1 2 100 45 2 3 230 60 3 4 300 10
df.set_index('Day',inplace=True)
df
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 1 | 200 | 20 |
| 2 | 100 | 45 |
| 3 | 230 | 60 |
| 4 | 300 | 10 |
# Column labels are case-sensitive: the column is "Visitors", so the
# lower-case key "visitors" matched nothing and left the frame unchanged.
df1 = df.rename(columns={"Visitors": "Users"})
df1
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 1 | 200 | 20 |
| 2 | 100 | 45 |
| 3 | 230 | 60 |
| 4 | 300 | 10 |
data= {'Day':[1,2,3,4,5,6],"Visitors":[1000,700,6000,1000,400,350],"Bounce_Rate":[20,25,25,15,10,34]}
df = pd.DataFrame(data)
print(df)
Day Visitors Bounce_Rate 0 1 1000 20 1 2 700 25 2 3 6000 25 3 4 1000 15 4 5 400 10 5 6 350 34
df.set_index('Day',inplace=True)
df
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 1 | 1000 | 20 |
| 2 | 700 | 25 |
| 3 | 6000 | 25 |
| 4 | 1000 | 15 |
| 5 | 400 | 10 |
| 6 | 350 | 34 |
df.head()
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 1 | 1000 | 20 |
| 2 | 700 | 25 |
| 3 | 6000 | 25 |
| 4 | 1000 | 15 |
| 5 | 400 | 10 |
df.tail()
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 2 | 700 | 25 |
| 3 | 6000 | 25 |
| 4 | 1000 | 15 |
| 5 | 400 | 10 |
| 6 | 350 | 34 |
df.sample(3)
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 4 | 1000 | 15 |
| 5 | 400 | 10 |
| 1 | 1000 | 20 |
df.head(2)
| Visitors | Bounce_Rate | |
|---|---|---|
| Day | ||
| 1 | 1000 | 20 |
| 2 | 700 | 25 |
df2 = pd.read_csv('march17.csv',names=['studentid','studentname','studentage'],header=0)
df2
| studentid | studentname | studentage | |
|---|---|---|---|
| 0 | 1 | Smith | 10 |
| 1 | 2 | John | 12 |
| 2 | 3 | jack | 10 |
| 3 | 4 | Andrew | 17 |
| 4 | 5 | Abdullahi | 25 |
| 5 | 6 | Faisal | 19 |
df2 = pd.read_csv('march17.csv',names=['studentid','studentname','studentage'],header=0,index_col='studentid')
df2
| studentname | studentage | |
|---|---|---|
| studentid | ||
| 1 | Smith | 10 |
| 2 | John | 12 |
| 3 | jack | 10 |
| 4 | Andrew | 17 |
| 5 | Abdullahi | 25 |
| 6 | Faisal | 19 |
df2 = pd.read_csv('march17.csv',sep=',')
df2
| Sid | Name | Age | |
|---|---|---|---|
| 0 | 1 | Smith | 10 |
| 1 | 2 | John | 12 |
| 2 | 3 | jack | 10 |
| 3 | 4 | Andrew | 17 |
| 4 | 5 | Abdullahi | 25 |
| 5 | 6 | Faisal | 19 |
# index=False keeps the default RangeIndex out of the file; otherwise it is
# read back as an extra "Unnamed: 0" column.
df2.to_csv('march17copy.csv', index=False)
df3 = pd.read_csv('march17copy.csv')
df3
| Unnamed: 0 | Sid | Name | Age | |
|---|---|---|---|---|
| 0 | 0 | 1 | Smith | 10 |
| 1 | 1 | 2 | John | 12 |
| 2 | 2 | 3 | jack | 10 |
| 3 | 3 | 4 | Andrew | 17 |
| 4 | 4 | 5 | Abdullahi | 25 |
| 5 | 5 | 6 | Faisal | 19 |
#read a tab-separated (TSV) file
# pd.read_csv('agedata1.csv',sep='\t')
df4=pd.read_excel('targets.xlsx')
df4
| market | month | ns_target | gm_target | np_target | |
|---|---|---|---|---|---|
| 0 | Australia | 2021-09-01 | 1.165663e+07 | 5.142392e+06 | -1.148236e+06 |
| 1 | Austria | 2021-09-01 | 1.213599e+06 | 3.198106e+05 | -1.663868e+05 |
| 2 | Bangladesh | 2021-09-01 | 2.055217e+06 | 7.955627e+05 | 5.503872e+03 |
| 3 | Brazil | 2021-09-01 | 2.282879e+05 | 6.721141e+04 | -2.129470e+04 |
| 4 | Canada | 2021-09-01 | 1.764242e+07 | 8.592894e+06 | -3.818243e+06 |
| ... | ... | ... | ... | ... | ... |
| 316 | South Korea | 2022-08-01 | 2.711193e+07 | 9.367438e+06 | -9.161665e+05 |
| 317 | Spain | 2022-08-01 | 6.350836e+06 | 2.657180e+06 | 5.257603e+05 |
| 318 | Sweden | 2022-08-01 | 8.366638e+05 | 3.453694e+05 | 2.593323e+04 |
| 319 | United Kingdom | 2022-08-01 | 1.303080e+07 | 4.817224e+06 | -4.839716e+06 |
| 320 | USA | 2022-08-01 | 6.337197e+07 | 2.529686e+07 | -7.396649e+06 |
321 rows × 5 columns
# Write the frame back without its RangeIndex; otherwise the sheet gains an
# extra unlabeled first column that the next read_excel loads as 'Unnamed: 0'.
# NOTE(review): this overwrites 'targets.xlsx', the same file df4 was read from.
df4.to_excel('targets.xlsx',index=False)
# Load every HTML <table> found at `url` into a list of DataFrames.
# NOTE(review): url is an empty string here, so read_html has no page to
# parse — fill in the address of the page whose tables should be loaded.
url=''
df5=pd.read_html(url)
df5
pip install requests
Requirement already satisfied: requests in c:\users\user\anaconda3\lib\site-packages (2.28.1) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\user\anaconda3\lib\site-packages (from requests) (1.26.11) Requirement already satisfied: certifi>=2017.4.17 in c:\users\user\anaconda3\lib\site-packages (from requests) (2022.9.14) Requirement already satisfied: idna<4,>=2.5 in c:\users\user\anaconda3\lib\site-packages (from requests) (3.3) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\user\anaconda3\lib\site-packages (from requests) (2.0.4) Note: you may need to restart the kernel to use updated packages.
import requests
# Fetch the page and show the HTTP status code (200 means success).
# A timeout is passed explicitly: without one, requests.get can block
# indefinitely when the server never answers.
response=requests.get('https://en.wikipedia.org/wiki/Data_science',timeout=10)
response.status_code
200
df=pd.read_csv('Head_Brain_csv')
df
| Gender | AgeRange | HeadSize | BrainWeight | |
|---|---|---|---|---|
| 0 | 1 | 1 | 4512 | 1530 |
| 1 | 1 | 1 | 3738 | 1297 |
| 2 | 1 | 1 | 4261 | 1335 |
| 3 | 1 | 1 | 3777 | 1282 |
| 4 | 1 | 1 | 4177 | 1590 |
| ... | ... | ... | ... | ... |
| 232 | 2 | 2 | 3214 | 1110 |
| 233 | 2 | 2 | 3394 | 1215 |
| 234 | 2 | 2 | 3233 | 1104 |
| 235 | 2 | 2 | 3352 | 1170 |
| 236 | 2 | 2 | 3391 | 1120 |
237 rows × 4 columns
PANDAS
import pandas as pd
df=pd.read_csv('eployee.csv1')
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
# df already holds the DataFrame returned by read_csv above, so re-wrapping it
# in pd.DataFrame leaves the displayed contents unchanged.
df=pd.DataFrame(data=df)
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.head()
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
df.tail()
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.sample()
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
df.head(10)
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
df.tail(5)
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.sample(2)
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
df.dtypes
SNo int64 Name object Age float64 City object Country object Salary float64 dtype: object
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22 entries, 0 to 21 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SNo 22 non-null int64 1 Name 22 non-null object 2 Age 19 non-null float64 3 City 19 non-null object 4 Country 22 non-null object 5 Salary 19 non-null float64 dtypes: float64(2), int64(1), object(3) memory usage: 1.2+ KB
df['Salary'].min()
3900.0
df['Salary'].max()
67666.0
df['Salary'].plot.bar()
<AxesSubplot:>
df['Age'].plot.bar()
<AxesSubplot:>
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
PANDAS INDEXING and SLICING
df.loc[:,'City']
0 Toronto 1 Edmonto 2 Toronto 3 HongKong 4 NaN 5 HongKong 6 BayArea 7 Hyderabad 8 newyork 9 newyork 10 HongKong 11 NaN 12 HongKong 13 NaN 14 Hyderabad 15 newyork 16 newyork 17 Edmonto 18 Congo 19 Angola 20 Finland 21 London Name: City, dtype: object
df.loc[:,'Name']
0 Tom 1 Lee 2 Dave 3 Anik 4 kamal 5 Geet 6 Steven 7 Ram 8 hari 9 yami 10 Anik 11 kamal 12 Geet 13 Steven 14 Ram 15 hari 16 yami 17 Lee 18 Andrew 19 soham 20 Grig 21 Kiara Name: Name, dtype: object
df.loc[:,['Salary','City']]
| Salary | City | |
|---|---|---|
| 0 | 20000.0 | Toronto |
| 1 | 3900.0 | Edmonto |
| 2 | 8000.0 | Toronto |
| 3 | 66672.0 | HongKong |
| 4 | NaN | NaN |
| 5 | 30007.0 | HongKong |
| 6 | 8300.0 | BayArea |
| 7 | 54666.0 | Hyderabad |
| 8 | 67666.0 | newyork |
| 9 | 8888.0 | newyork |
| 10 | 66672.0 | HongKong |
| 11 | 34344.0 | NaN |
| 12 | 30007.0 | HongKong |
| 13 | 8300.0 | NaN |
| 14 | 54666.0 | Hyderabad |
| 15 | 67666.0 | newyork |
| 16 | 8888.0 | newyork |
| 17 | NaN | Edmonto |
| 18 | 34432.0 | Congo |
| 19 | 34534.0 | Angola |
| 20 | 33451.0 | Finland |
| 21 | NaN | London |
df.iloc[:,0]
0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 11 10 12 11 13 12 14 13 15 14 16 15 17 16 18 17 19 18 20 19 21 20 22 21 23 Name: SNo, dtype: int64
df.iloc[:,1]
0 Tom 1 Lee 2 Dave 3 Anik 4 kamal 5 Geet 6 Steven 7 Ram 8 hari 9 yami 10 Anik 11 kamal 12 Geet 13 Steven 14 Ram 15 hari 16 yami 17 Lee 18 Andrew 19 soham 20 Grig 21 Kiara Name: Name, dtype: object
df.iloc[:,2]
0 28.0 1 31.0 2 34.0 3 26.0 4 30.0 5 NaN 6 43.0 7 38.0 8 50.0 9 NaN 10 26.0 11 30.0 12 22.0 13 43.0 14 38.0 15 NaN 16 27.0 17 31.0 18 34.0 19 32.0 20 26.0 21 27.0 Name: Age, dtype: float64
df.iloc[:,4]
0 Canada 1 Canada 2 Canada 3 Asia 4 America 5 Asia 6 America 7 Asia 8 America 9 America 10 Asia 11 America 12 Asia 13 America 14 Asia 15 America 16 America 17 Canada 18 Africa 19 Africa 20 Europe 21 Europe Name: Country, dtype: object
df.iloc[:,5]
0 20000.0 1 3900.0 2 8000.0 3 66672.0 4 NaN 5 30007.0 6 8300.0 7 54666.0 8 67666.0 9 8888.0 10 66672.0 11 34344.0 12 30007.0 13 8300.0 14 54666.0 15 67666.0 16 8888.0 17 NaN 18 34432.0 19 34534.0 20 33451.0 21 NaN Name: Salary, dtype: float64
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.iloc[0:4,0:4]
| SNo | Name | Age | City | |
|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto |
| 1 | 2 | Lee | 31.0 | Edmonto |
| 2 | 3 | Dave | 34.0 | Toronto |
| 3 | 4 | Anik | 26.0 | HongKong |
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.iloc[:,1]
0 Tom 1 Lee 2 Dave 3 Anik 4 kamal 5 Geet 6 Steven 7 Ram 8 hari 9 yami 10 Anik 11 kamal 12 Geet 13 Steven 14 Ram 15 hari 16 yami 17 Lee 18 Andrew 19 soham 20 Grig 21 Kiara Name: Name, dtype: object
df.iloc[:,4]
0 Canada 1 Canada 2 Canada 3 Asia 4 America 5 Asia 6 America 7 Asia 8 America 9 America 10 Asia 11 America 12 Asia 13 America 14 Asia 15 America 16 America 17 Canada 18 Africa 19 Africa 20 Europe 21 Europe Name: Country, dtype: object
df.iloc[0:4,0:4]
| SNo | Name | Age | City | |
|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto |
| 1 | 2 | Lee | 31.0 | Edmonto |
| 2 | 3 | Dave | 34.0 | Toronto |
| 3 | 4 | Anik | 26.0 | HongKong |
df.iloc[2:10,3:5]
| City | Country | |
|---|---|---|
| 2 | Toronto | Canada |
| 3 | HongKong | Asia |
| 4 | NaN | America |
| 5 | HongKong | Asia |
| 6 | BayArea | America |
| 7 | Hyderabad | Asia |
| 8 | newyork | America |
| 9 | newyork | America |
df.iloc[4:11,2:4]
| Age | City | |
|---|---|---|
| 4 | 30.0 | NaN |
| 5 | NaN | HongKong |
| 6 | 43.0 | BayArea |
| 7 | 38.0 | Hyderabad |
| 8 | 50.0 | newyork |
| 9 | NaN | newyork |
| 10 | 26.0 | HongKong |
df.iloc[0:6,3:5]
| City | Country | |
|---|---|---|
| 0 | Toronto | Canada |
| 1 | Edmonto | Canada |
| 2 | Toronto | Canada |
| 3 | HongKong | Asia |
| 4 | NaN | America |
| 5 | HongKong | Asia |
df.iloc[1:5,0:5]
| SNo | Name | Age | City | Country | |
|---|---|---|---|---|---|
| 1 | 2 | Lee | 31.0 | Edmonto | Canada |
| 2 | 3 | Dave | 34.0 | Toronto | Canada |
| 3 | 4 | Anik | 26.0 | HongKong | Asia |
| 4 | 5 | kamal | 30.0 | NaN | America |
df.iloc[7:12,3:]
| City | Country | Salary | |
|---|---|---|---|
| 7 | Hyderabad | Asia | 54666.0 |
| 8 | newyork | America | 67666.0 |
| 9 | newyork | America | 8888.0 |
| 10 | HongKong | Asia | 66672.0 |
| 11 | NaN | America | 34344.0 |
df.iloc[0:10,4:]
| Country | Salary | |
|---|---|---|
| 0 | Canada | 20000.0 |
| 1 | Canada | 3900.0 |
| 2 | Canada | 8000.0 |
| 3 | Asia | 66672.0 |
| 4 | America | NaN |
| 5 | Asia | 30007.0 |
| 6 | America | 8300.0 |
| 7 | Asia | 54666.0 |
| 8 | America | 67666.0 |
| 9 | America | 8888.0 |
df.iloc[:,-1]
0 20000.0 1 3900.0 2 8000.0 3 66672.0 4 NaN 5 30007.0 6 8300.0 7 54666.0 8 67666.0 9 8888.0 10 66672.0 11 34344.0 12 30007.0 13 8300.0 14 54666.0 15 67666.0 16 8888.0 17 NaN 18 34432.0 19 34534.0 20 33451.0 21 NaN Name: Salary, dtype: float64
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.iloc[:,-3]
0 Toronto 1 Edmonto 2 Toronto 3 HongKong 4 NaN 5 HongKong 6 BayArea 7 Hyderabad 8 newyork 9 newyork 10 HongKong 11 NaN 12 HongKong 13 NaN 14 Hyderabad 15 newyork 16 newyork 17 Edmonto 18 Congo 19 Angola 20 Finland 21 London Name: City, dtype: object
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
df.count()
SNo 22 Name 22 Age 19 City 19 Country 22 Salary 19 dtype: int64
df['Name'].count()
22
df['Salary'].count()
19
df['Name'].unique()
array(['Tom', 'Lee', 'Dave', 'Anik', 'kamal', 'Geet', 'Steven', 'Ram',
'hari', 'yami', 'Andrew', 'soham', 'Grig', 'Kiara'], dtype=object)
df['Name'].nunique()
14
df['City'].count()
19
df['City'].unique()
array(['Toronto', 'Edmonto', 'HongKong', nan, 'BayArea', 'Hyderabad',
'newyork', 'Congo', 'Angola', 'Finland', 'London'], dtype=object)
df['City'].nunique()
10
print(df['Name'].describe())
count 22 unique 14 top Lee freq 2 Name: Name, dtype: object
df.describe()
| SNo | Age | Salary | |
|---|---|---|---|
| count | 22.000000 | 19.000000 | 19.000000 |
| mean | 12.090909 | 32.421053 | 33739.947368 |
| std | 6.927578 | 7.174924 | 23151.188084 |
| min | 1.000000 | 22.000000 | 3900.000000 |
| 25% | 6.250000 | 27.000000 | 8888.000000 |
| 50% | 12.500000 | 31.000000 | 33451.000000 |
| 75% | 17.750000 | 36.000000 | 54666.000000 |
| max | 23.000000 | 50.000000 | 67666.000000 |
df.describe(include='all')
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| count | 22.000000 | 22 | 19.000000 | 19 | 22 | 19.000000 |
| unique | NaN | 14 | NaN | 10 | 5 | NaN |
| top | NaN | Lee | NaN | HongKong | America | NaN |
| freq | NaN | 2 | NaN | 4 | 8 | NaN |
| mean | 12.090909 | NaN | 32.421053 | NaN | NaN | 33739.947368 |
| std | 6.927578 | NaN | 7.174924 | NaN | NaN | 23151.188084 |
| min | 1.000000 | NaN | 22.000000 | NaN | NaN | 3900.000000 |
| 25% | 6.250000 | NaN | 27.000000 | NaN | NaN | 8888.000000 |
| 50% | 12.500000 | NaN | 31.000000 | NaN | NaN | 33451.000000 |
| 75% | 17.750000 | NaN | 36.000000 | NaN | NaN | 54666.000000 |
| max | 23.000000 | NaN | 50.000000 | NaN | NaN | 67666.000000 |
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
How to Drop Rows & Columns
# Non-destructive drop: df1 is a new frame without the rows labelled
# 18, 19 and 20; df itself is left untouched.
df1 = df.drop(index=[18, 19, 20])
df1
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
# Remove the rows labelled 1, 2 and 3 from df in place.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because those labels are already gone.
df.drop([1,2,3],axis=0,inplace=True,errors='ignore')
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
# Remove the 'Name' column from df in place.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the column no longer exists.
df.drop(columns=['Name'],inplace=True,errors='ignore')
df
| SNo | Age | City | Country | Salary | |
|---|---|---|---|---|---|
| 0 | 1 | 28.0 | Toronto | Canada | 20000.0 |
| 4 | 5 | 30.0 | NaN | America | NaN |
| 5 | 6 | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | NaN | newyork | America | 8888.0 |
| 10 | 12 | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | NaN | newyork | America | 67666.0 |
| 16 | 18 | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | 27.0 | London | Europe | NaN |
import pandas as pd
df=pd.read_csv('hiredata')
df
| Name | HireDate | Salary | SickDaysremaining | |
|---|---|---|---|---|
| 0 | Graham Chapman | 03/15/14 | 50000.0 | 10 |
| 1 | John Cleese | 06/01/15 | 65000.0 | 8 |
| 2 | Eric Idle | 05/12/14 | 45000.0 | 10 |
| 3 | Terry Jones | 11/01/13 | 70000.0 | 3 |
| 4 | Terry Gilliam | 08/12/14 | 48000.0 | 7 |
| 5 | Michael Palin | 05/23/13 | 66000.0 | 8 |
# Convert HireDate from text to datetime64.
# The raw values look like '03/15/14' (month/day/two-digit year), so the
# format is stated explicitly instead of relying on pandas' format inference,
# which can swap day and month on ambiguous values such as '06/01/15'.
df['HireDate']=pd.to_datetime(df['HireDate'],format='%m/%d/%y')
df
| Name | HireDate | Salary | SickDaysremaining | |
|---|---|---|---|---|
| 0 | Graham Chapman | 2014-03-15 | 50000.0 | 10 |
| 1 | John Cleese | 2015-06-01 | 65000.0 | 8 |
| 2 | Eric Idle | 2014-05-12 | 45000.0 | 10 |
| 3 | Terry Jones | 2013-11-01 | 70000.0 | 3 |
| 4 | Terry Gilliam | 2014-08-12 | 48000.0 | 7 |
| 5 | Michael Palin | 2013-05-23 | 66000.0 | 8 |
# Daily range covering 1-20 November 2024.
# ISO 'YYYY-MM-DD' strings are used because slash-separated dates such as
# '11/01/2024' are parsed month-first (mm/dd/yyyy), not dd/mm/yyyy.
daterange=pd.date_range(start='2024-11-01',end='2024-11-20',freq='D')
daterange
DatetimeIndex(['2024-11-01', '2024-11-02', '2024-11-03', '2024-11-04',
'2024-11-05', '2024-11-06', '2024-11-07', '2024-11-08',
'2024-11-09', '2024-11-10', '2024-11-11', '2024-11-12',
'2024-11-13', '2024-11-14', '2024-11-15', '2024-11-16',
'2024-11-17', '2024-11-18', '2024-11-19', '2024-11-20'],
dtype='datetime64[ns]', freq='D')
# freq='M' yields month-end dates, so the range stops at 2024-11-30:
# the next month end (2024-12-31) falls after the end bound of 2024-12-20.
# NOTE(review): newer pandas releases spell month-end as 'ME' and deprecate
# 'M' — confirm against the installed version before changing it.
daterange=pd.date_range(start='01/01/2024',end='12/20/2024',freq='M')
daterange
DatetimeIndex(['2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30',
'2024-05-31', '2024-06-30', '2024-07-31', '2024-08-31',
'2024-09-30', '2024-10-31', '2024-11-30'],
dtype='datetime64[ns]', freq='M')
import datetime
dt=datetime.datetime.now()
print(dt)
2024-03-29 06:52:31.750332
print(dt+pd.to_timedelta(10,unit='D'))
2024-04-08 06:52:31.750332
df['Day']=df['HireDate'].dt.day
df['Month']=df['HireDate'].dt.month
df['Year']=df['HireDate'].dt.year
print(df)
Name HireDate Salary SickDaysremaining Day Month Year 0 Graham Chapman 2014-03-15 50000.0 10 15 3 2014 1 John Cleese 2015-06-01 65000.0 8 1 6 2015 2 Eric Idle 2014-05-12 45000.0 10 12 5 2014 3 Terry Jones 2013-11-01 70000.0 3 1 11 2013 4 Terry Gilliam 2014-08-12 48000.0 7 12 8 2014 5 Michael Palin 2013-05-23 66000.0 8 23 5 2013
import pandas as pd
df=pd.read_csv('eployee.csv1')
print(df)
SNo Name Age City Country Salary 0 1 Tom 28.0 Toronto Canada 20000.0 1 2 Lee 31.0 Edmonto Canada 3900.0 2 3 Dave 34.0 Toronto Canada 8000.0 3 4 Anik 26.0 HongKong Asia 66672.0 4 5 kamal 30.0 NaN America NaN 5 6 Geet NaN HongKong Asia 30007.0 6 7 Steven 43.0 BayArea America 8300.0 7 8 Ram 38.0 Hyderabad Asia 54666.0 8 9 hari 50.0 newyork America 67666.0 9 11 yami NaN newyork America 8888.0 10 12 Anik 26.0 HongKong Asia 66672.0 11 13 kamal 30.0 NaN America 34344.0 12 14 Geet 22.0 HongKong Asia 30007.0 13 15 Steven 43.0 NaN America 8300.0 14 16 Ram 38.0 Hyderabad Asia 54666.0 15 17 hari NaN newyork America 67666.0 16 18 yami 27.0 newyork America 8888.0 17 19 Lee 31.0 Edmonto Canada NaN 18 20 Andrew 34.0 Congo Africa 34432.0 19 21 soham 32.0 Angola Africa 34534.0 20 22 Grig 26.0 Finland Europe 33451.0 21 23 Kiara 27.0 London Europe NaN
df1=pd.DataFrame(data=df,columns=['Name'])
df1
| Name | |
|---|---|
| 0 | Tom |
| 1 | Lee |
| 2 | Dave |
| 3 | Anik |
| 4 | kamal |
| 5 | Geet |
| 6 | Steven |
| 7 | Ram |
| 8 | hari |
| 9 | yami |
| 10 | Anik |
| 11 | kamal |
| 12 | Geet |
| 13 | Steven |
| 14 | Ram |
| 15 | hari |
| 16 | yami |
| 17 | Lee |
| 18 | Andrew |
| 19 | soham |
| 20 | Grig |
| 21 | Kiara |
df1=pd.DataFrame(data=df,columns=['Name','Age','Salary'])
df1
| Name | Age | Salary | |
|---|---|---|---|
| 0 | Tom | 28.0 | 20000.0 |
| 1 | Lee | 31.0 | 3900.0 |
| 2 | Dave | 34.0 | 8000.0 |
| 3 | Anik | 26.0 | 66672.0 |
| 4 | kamal | 30.0 | NaN |
| 5 | Geet | NaN | 30007.0 |
| 6 | Steven | 43.0 | 8300.0 |
| 7 | Ram | 38.0 | 54666.0 |
| 8 | hari | 50.0 | 67666.0 |
| 9 | yami | NaN | 8888.0 |
| 10 | Anik | 26.0 | 66672.0 |
| 11 | kamal | 30.0 | 34344.0 |
| 12 | Geet | 22.0 | 30007.0 |
| 13 | Steven | 43.0 | 8300.0 |
| 14 | Ram | 38.0 | 54666.0 |
| 15 | hari | NaN | 67666.0 |
| 16 | yami | 27.0 | 8888.0 |
| 17 | Lee | 31.0 | NaN |
| 18 | Andrew | 34.0 | 34432.0 |
| 19 | soham | 32.0 | 34534.0 |
| 20 | Grig | 26.0 | 33451.0 |
| 21 | Kiara | 27.0 | NaN |
df1[['Name','Age','Salary']].head(10)
| Name | Age | Salary | |
|---|---|---|---|
| 0 | Tom | 28.0 | 20000.0 |
| 1 | Lee | 31.0 | 3900.0 |
| 2 | Dave | 34.0 | 8000.0 |
| 3 | Anik | 26.0 | 66672.0 |
| 4 | kamal | 30.0 | NaN |
| 5 | Geet | NaN | 30007.0 |
| 6 | Steven | 43.0 | 8300.0 |
| 7 | Ram | 38.0 | 54666.0 |
| 8 | hari | 50.0 | 67666.0 |
| 9 | yami | NaN | 8888.0 |
df.head()
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
df[df['City']=='newyork']
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
# select * from df where city=newyork and salary > 50000
df[(df['City']=='newyork')&(df['Salary']>50000)]
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
# select * from df where Country=America OR Salary > 50000
df[(df['Country']=='America')|(df['Salary']>50000)]
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
# select * from df where Salary is null
df[df['Salary'].isna()]
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
# SQL equivalent: select * from df where Salary is not null
df.loc[df['Salary'].notnull()]
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
# df5 is a copy of df without any row that contains a missing value;
# df itself is not modified by this call.
df5 = df.dropna(axis='index', how='any')
print(df5)
SNo Name Age City Country Salary 0 1 Tom 28.0 Toronto Canada 20000.0 1 2 Lee 31.0 Edmonto Canada 3900.0 2 3 Dave 34.0 Toronto Canada 8000.0 3 4 Anik 26.0 HongKong Asia 66672.0 6 7 Steven 43.0 BayArea America 8300.0 7 8 Ram 38.0 Hyderabad Asia 54666.0 8 9 hari 50.0 newyork America 67666.0 10 12 Anik 26.0 HongKong Asia 66672.0 12 14 Geet 22.0 HongKong Asia 30007.0 14 16 Ram 38.0 Hyderabad Asia 54666.0 16 18 yami 27.0 newyork America 8888.0 18 20 Andrew 34.0 Congo Africa 34432.0 19 21 soham 32.0 Angola Africa 34534.0 20 22 Grig 26.0 Finland Europe 33451.0
df.isnull().sum()
SNo 0 Name 0 Age 3 City 3 Country 0 Salary 3 dtype: int64
# Remove incomplete rows from df itself, then confirm that every
# column now reports zero missing values.
df.dropna(axis='index', inplace=True)
df.isna().sum()
SNo 0 Name 0 Age 0 City 0 Country 0 Salary 0 dtype: int64
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
import pandas as pd
# First roster: subject ids '1'-'5' (stored as strings) with first/last names.
raw_data = dict(
    subject_id=['1', '2', '3', '4', '5'],
    first_name=['Alex', 'Amy', 'Allen', 'Abdul', 'Halima'],
    last_name=['Ali', 'Abdullahi', 'Faisal', 'Abdulaziz', 'Basit'],
)
# Column order follows the order in which the keys were declared above.
df_a = pd.DataFrame(raw_data, columns=list(raw_data))
df_a
| subject_id | first_name | last_name | |
|---|---|---|---|
| 0 | 1 | Alex | Ali |
| 1 | 2 | Amy | Abdullahi |
| 2 | 3 | Allen | Faisal |
| 3 | 4 | Abdul | Abdulaziz |
| 4 | 5 | Halima | Basit |
# Second roster: subject ids '4'-'8'; ids '4' and '5' also occur in the first roster.
raw_data = dict(
    subject_id=['4', '5', '6', '7', '8'],
    first_name=['Billy', 'Segun', 'Ali', 'Jamal', 'Sodiq'],
    last_name=['Alimi', 'Basit', 'Fawaz', 'Kamal', 'Abdulyezid'],
)
# Column order follows the order in which the keys were declared above.
df_b = pd.DataFrame(raw_data, columns=list(raw_data))
df_b
| subject_id | first_name | last_name | |
|---|---|---|---|
| 0 | 4 | Billy | Alimi |
| 1 | 5 | Segun | Basit |
| 2 | 6 | Ali | Fawaz |
| 3 | 7 | Jamal | Kamal |
| 4 | 8 | Sodiq | Abdulyezid |
# Stack df_b underneath df_a and renumber the row index from 0.
# Subject ids are not de-duplicated: '4' and '5' each appear twice.
df_new = pd.concat(objs=[df_a, df_b], axis=0, ignore_index=True)
df_new
| subject_id | first_name | last_name | |
|---|---|---|---|
| 0 | 1 | Alex | Ali |
| 1 | 2 | Amy | Abdullahi |
| 2 | 3 | Allen | Faisal |
| 3 | 4 | Abdul | Abdulaziz |
| 4 | 5 | Halima | Basit |
| 5 | 4 | Billy | Alimi |
| 6 | 5 | Segun | Basit |
| 7 | 6 | Ali | Fawaz |
| 8 | 7 | Jamal | Kamal |
| 9 | 8 | Sodiq | Abdulyezid |
# Test ids keyed by subject id; subject ids are the strings '1'..'10'.
raw_data = dict(
    subject_id=[str(number) for number in range(1, 11)],
    test_id=[51, 15, 15, 61, 16, 14, 15, 1, 61, 16],
)
df_n = pd.DataFrame(raw_data, columns=list(raw_data))
df_n
| subject_id | test_id | |
|---|---|---|
| 0 | 1 | 51 |
| 1 | 2 | 15 |
| 2 | 3 | 15 |
| 3 | 4 | 61 |
| 4 | 5 | 16 |
| 5 | 6 | 14 |
| 6 | 7 | 15 |
| 7 | 8 | 1 |
| 8 | 9 | 61 |
| 9 | 10 | 16 |
# With no `on=` given, the join key defaults to the columns the two
# frames have in common (here only subject_id); the join type is inner.
df_n.merge(df_new)
| subject_id | test_id | first_name | last_name | |
|---|---|---|---|---|
| 0 | 1 | 51 | Alex | Ali |
| 1 | 2 | 15 | Amy | Abdullahi |
| 2 | 3 | 15 | Allen | Faisal |
| 3 | 4 | 61 | Abdul | Abdulaziz |
| 4 | 4 | 61 | Billy | Alimi |
| 5 | 5 | 16 | Halima | Basit |
| 6 | 5 | 16 | Segun | Basit |
| 7 | 6 | 14 | Ali | Fawaz |
| 8 | 7 | 15 | Jamal | Kamal |
| 9 | 8 | 1 | Sodiq | Abdulyezid |
# Same inner join as the default-key merge, with the key named explicitly.
df_n.merge(df_new, on='subject_id')
| subject_id | test_id | first_name | last_name | |
|---|---|---|---|---|
| 0 | 1 | 51 | Alex | Ali |
| 1 | 2 | 15 | Amy | Abdullahi |
| 2 | 3 | 15 | Allen | Faisal |
| 3 | 4 | 61 | Abdul | Abdulaziz |
| 4 | 4 | 61 | Billy | Alimi |
| 5 | 5 | 16 | Halima | Basit |
| 6 | 5 | 16 | Segun | Basit |
| 7 | 6 | 14 | Ali | Fawaz |
| 8 | 7 | 15 | Jamal | Kamal |
| 9 | 8 | 1 | Sodiq | Abdulyezid |
# Inner join: keep only the subject_ids present in both df_a and df_b.
# Name columns that exist in both frames get the suffixes _x (left) and _y (right).
df_a.merge(df_b, how='inner', on='subject_id')
| subject_id | first_name_x | last_name_x | first_name_y | last_name_y | |
|---|---|---|---|---|---|
| 0 | 4 | Abdul | Abdulaziz | Billy | Alimi |
| 1 | 5 | Halima | Basit | Segun | Basit |
# Outer join: keep every subject_id from either frame; the side that
# has no matching row is filled with NaN.
df_a.merge(df_b, how='outer', on='subject_id')
| subject_id | first_name_x | last_name_x | first_name_y | last_name_y | |
|---|---|---|---|---|---|
| 0 | 1 | Alex | Ali | NaN | NaN |
| 1 | 2 | Amy | Abdullahi | NaN | NaN |
| 2 | 3 | Allen | Faisal | NaN | NaN |
| 3 | 4 | Abdul | Abdulaziz | Billy | Alimi |
| 4 | 5 | Halima | Basit | Segun | Basit |
| 5 | 6 | NaN | NaN | Ali | Fawaz |
| 6 | 7 | NaN | NaN | Jamal | Kamal |
| 7 | 8 | NaN | NaN | Sodiq | Abdulyezid |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# a holds 1..6; b is the same sequence shifted up by one (2..7).
a = np.arange(1, 7)
b = a + 1
a
array([1, 2, 3, 4, 5, 6])
b
array([2, 3, 4, 5, 6, 7])
# Plain line plot of b against a, using matplotlib's default style.
plt.plot(a, b)
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# Line plot of s (y-axis) against p (x-axis).
p = np.array([12, 24, 56, 90, 87, 90, 45, 34])
s = np.array([24, 56, 43, 67, 89, 90, 96, 76])
plt.plot(p, s)
plt.xlabel('p')
plt.ylabel('s')
plt.title('Linear Line')
plt.show()
# Same line as before, with a star marker drawn at every data point.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, marker='*')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# Star markers again, this time with the line and markers coloured red.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, color='red', marker='*')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# Format string '--g': dashed line style, green colour, no markers.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, '--g')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
help(plt.plot)
Help on function plot in module matplotlib.pyplot:
plot(*args, scalex=True, scaley=True, data=None, **kwargs)
Plot y versus x as lines and/or markers.
Call signatures::
plot([x], y, [fmt], *, data=None, **kwargs)
plot([x], y, [fmt], [x2], y2, [fmt2], ..., **kwargs)
The coordinates of the points or line nodes are given by *x*, *y*.
The optional parameter *fmt* is a convenient way for defining basic
formatting like color, marker and linestyle. It's a shortcut string
notation described in the *Notes* section below.
>>> plot(x, y) # plot x and y using default line style and color
>>> plot(x, y, 'bo') # plot x and y using blue circle markers
>>> plot(y) # plot y using x as index array 0..N-1
>>> plot(y, 'r+') # ditto, but with red plusses
You can use `.Line2D` properties as keyword arguments for more
control on the appearance. Line properties and *fmt* can be mixed.
The following two calls yield identical results:
>>> plot(x, y, 'go--', linewidth=2, markersize=12)
>>> plot(x, y, color='green', marker='o', linestyle='dashed',
... linewidth=2, markersize=12)
When conflicting with *fmt*, keyword arguments take precedence.
**Plotting labelled data**
There's a convenient way for plotting objects with labelled data (i.e.
data that can be accessed by index ``obj['y']``). Instead of giving
the data in *x* and *y*, you can provide the object in the *data*
parameter and just give the labels for *x* and *y*::
>>> plot('xlabel', 'ylabel', data=obj)
All indexable objects are supported. This could e.g. be a `dict`, a
`pandas.DataFrame` or a structured numpy array.
**Plotting multiple sets of data**
There are various ways to plot multiple sets of data.
- The most straight forward way is just to call `plot` multiple times.
Example:
>>> plot(x1, y1, 'bo')
>>> plot(x2, y2, 'go')
- If *x* and/or *y* are 2D arrays a separate data set will be drawn
for every column. If both *x* and *y* are 2D, they must have the
same shape. If only one of them is 2D with shape (N, m) the other
must have length N and will be used for every data set m.
Example:
>>> x = [1, 2, 3]
>>> y = np.array([[1, 2], [3, 4], [5, 6]])
>>> plot(x, y)
is equivalent to:
>>> for col in range(y.shape[1]):
... plot(x, y[:, col])
- The third way is to specify multiple sets of *[x]*, *y*, *[fmt]*
groups::
>>> plot(x1, y1, 'g^', x2, y2, 'g-')
In this case, any additional keyword argument applies to all
datasets. Also this syntax cannot be combined with the *data*
parameter.
By default, each line is assigned a different style specified by a
'style cycle'. The *fmt* and line property parameters are only
necessary if you want explicit deviations from these defaults.
Alternatively, you can also change the style cycle using
:rc:`axes.prop_cycle`.
Parameters
----------
x, y : array-like or scalar
The horizontal / vertical coordinates of the data points.
*x* values are optional and default to ``range(len(y))``.
Commonly, these parameters are 1D arrays.
They can also be scalars, or two-dimensional (in that case, the
columns represent separate data sets).
These arguments cannot be passed as keywords.
fmt : str, optional
A format string, e.g. 'ro' for red circles. See the *Notes*
section for a full description of the format strings.
Format strings are just an abbreviation for quickly setting
basic line properties. All of these and more can also be
controlled by keyword arguments.
This argument cannot be passed as keyword.
data : indexable object, optional
An object with labelled data. If given, provide the label names to
plot in *x* and *y*.
.. note::
Technically there's a slight ambiguity in calls where the
second label is a valid *fmt*. ``plot('n', 'o', data=obj)``
could be ``plt(x, y)`` or ``plt(y, fmt)``. In such cases,
the former interpretation is chosen, but a warning is issued.
You may suppress the warning by adding an empty format string
``plot('n', 'o', '', data=obj)``.
Returns
-------
list of `.Line2D`
A list of lines representing the plotted data.
Other Parameters
----------------
scalex, scaley : bool, default: True
These parameters determine if the view limits are adapted to the
data limits. The values are passed on to `autoscale_view`.
**kwargs : `.Line2D` properties, optional
*kwargs* are used to specify properties like a line label (for
auto legends), linewidth, antialiasing, marker face color.
Example::
>>> plot([1, 2, 3], [1, 2, 3], 'go-', label='line 1', linewidth=2)
>>> plot([1, 2, 3], [1, 4, 9], 'rs', label='line 2')
If you specify multiple lines with one plot call, the kwargs apply
to all those lines. In case the label object is iterable, each
element is used as labels for each set of data.
Here is a list of available `.Line2D` properties:
Properties:
agg_filter: a filter function, which takes a (m, n, 3) float array and a dpi value, and returns a (m, n, 3) array
alpha: scalar or None
animated: bool
antialiased or aa: bool
clip_box: `.Bbox`
clip_on: bool
clip_path: Patch or (Path, Transform) or None
color or c: color
dash_capstyle: `.CapStyle` or {'butt', 'projecting', 'round'}
dash_joinstyle: `.JoinStyle` or {'miter', 'round', 'bevel'}
dashes: sequence of floats (on/off ink in points) or (None, None)
data: (2, N) array or two 1D arrays
drawstyle or ds: {'default', 'steps', 'steps-pre', 'steps-mid', 'steps-post'}, default: 'default'
figure: `.Figure`
fillstyle: {'full', 'left', 'right', 'bottom', 'top', 'none'}
gid: str
in_layout: bool
label: object
linestyle or ls: {'-', '--', '-.', ':', '', (offset, on-off-seq), ...}
linewidth or lw: float
marker: marker style string, `~.path.Path` or `~.markers.MarkerStyle`
markeredgecolor or mec: color
markeredgewidth or mew: float
markerfacecolor or mfc: color
markerfacecoloralt or mfcalt: color
markersize or ms: float
markevery: None or int or (int, int) or slice or list[int] or float or (float, float) or list[bool]
path_effects: `.AbstractPathEffect`
picker: float or callable[[Artist, Event], tuple[bool, dict]]
pickradius: float
rasterized: bool
sketch_params: (scale: float, length: float, randomness: float)
snap: bool or None
solid_capstyle: `.CapStyle` or {'butt', 'projecting', 'round'}
solid_joinstyle: `.JoinStyle` or {'miter', 'round', 'bevel'}
transform: unknown
url: str
visible: bool
xdata: 1D array
ydata: 1D array
zorder: float
See Also
--------
scatter : XY scatter plot with markers of varying size and/or color (
sometimes also called bubble chart).
Notes
-----
**Format Strings**
A format string consists of a part for color, marker and line::
fmt = '[marker][line][color]'
Each of them is optional. If not provided, the value from the style
cycle is used. Exception: If ``line`` is given, but no ``marker``,
the data will be a line without markers.
Other combinations such as ``[color][marker][line]`` are also
supported, but note that their parsing may be ambiguous.
**Markers**
============= ===============================
character description
============= ===============================
``'.'`` point marker
``','`` pixel marker
``'o'`` circle marker
``'v'`` triangle_down marker
``'^'`` triangle_up marker
``'<'`` triangle_left marker
``'>'`` triangle_right marker
``'1'`` tri_down marker
``'2'`` tri_up marker
``'3'`` tri_left marker
``'4'`` tri_right marker
``'8'`` octagon marker
``'s'`` square marker
``'p'`` pentagon marker
``'P'`` plus (filled) marker
``'*'`` star marker
``'h'`` hexagon1 marker
``'H'`` hexagon2 marker
``'+'`` plus marker
``'x'`` x marker
``'X'`` x (filled) marker
``'D'`` diamond marker
``'d'`` thin_diamond marker
``'|'`` vline marker
``'_'`` hline marker
============= ===============================
**Line Styles**
============= ===============================
character description
============= ===============================
``'-'`` solid line style
``'--'`` dashed line style
``'-.'`` dash-dot line style
``':'`` dotted line style
============= ===============================
Example format strings::
'b' # blue markers with default shape
'or' # red circles
'-g' # green solid line
'--' # dashed line with default color
'^k:' # black triangle_up markers connected by a dotted line
**Colors**
The supported color abbreviations are the single letter codes
============= ===============================
character color
============= ===============================
``'b'`` blue
``'g'`` green
``'r'`` red
``'c'`` cyan
``'m'`` magenta
``'y'`` yellow
``'k'`` black
``'w'`` white
============= ===============================
and the ``'CN'`` colors that index into the default property cycle.
If the color is the only part of the format string, you can
additionally use any `matplotlib.colors` spec, e.g. full names
(``'green'``) or hex strings (``'#008000'``).
# Format string 'bo': blue circle markers with no connecting line.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, 'bo')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# Format string 'r+': red plus markers with no connecting line.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, 'r+')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# Style given through Line2D keyword arguments instead of a format string:
# green dashed line with a circle marker at each point.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
plt.plot(a, b, linestyle='dashed', marker='o', color='green')
plt.xlabel('a')
plt.ylabel('b')
plt.title('Linear line')
plt.show()
# plt.plot also accepts plain Python lists, not only numpy arrays.
x = list(range(12, 17))
y = [77, 33, 22, 44, 55]
plt.plot(x, y, color='green', marker='o')
plt.xlabel('x')
plt.ylabel('y')
plt.title('list')
plt.show()
# Line plot with sales on the x-axis and the fruit names as
# categorical positions on the y-axis.
fruitname = ['Apple', 'Mango', 'Kiwi', 'Grapes', 'papaya']
sales = list(range(100, 501, 100))
plt.plot(sales, fruitname, color='blue', marker='o')
plt.xlabel('sales')
plt.ylabel('fruitname')
plt.title('Sales ')
plt.show()
# Bar chart of sales per fruit; align='edge' places each bar's left
# edge (rather than its centre) on the category tick.
fruitname = ['Apple', 'Mango', 'Kiwi', 'Grapes', 'papaya']
sales = list(range(100, 501, 100))
plt.bar(x=fruitname, height=sales, align='edge', color='green')
plt.xlabel('fruit')
plt.ylabel('sale')
plt.title('Sales ')
plt.show()
# Histogram of the sample values in ten-unit bins covering 0-100.
a = np.array([12, 10, 22, 34, 35, 37, 40, 55, 54, 62, 68, 69, 89, 83, 92])
plt.hist(a, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.title('histogram')
# show has to be called: the bare name `plt.show` only evaluates to the
# function object (echoed as "<function matplotlib.pyplot.show...>").
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
import matplotlib.pyplot as plt
import pandas as pd
# Load the student marks table, print it in full, then list its column labels.
# NOTE(review): the student names end up in a column called 'Unnamed: 0';
# reading with index_col=0 may be what is intended - confirm.
df = pd.read_csv(filepath_or_buffer='student_mark.csv')
print(df)
df.columns
Unnamed: 0 Gender DOB Maths Physics Chemistry English Biology \ 0 John M 05-04-1988 55 45 56 87 21 1 Suresh M 04-05-1987 75 96 78 64 90 2 Ramesh M 25-05-1989 25 54 89 76 95 3 Jessica F 12-08-1990 78 96 86 63 54 4 Jennifer F 02-09-1989 58 96 78 46 96 5 Annu F 05-04-1988 45 87 52 89 55 6 pooja F 04-05-1987 55 64 61 58 75 7 Ritesh M 25-05-1989 54 76 87 56 25 8 Farha F 12-08-1990 55 63 89 75 78 9 Mukesh M 02-09-1989 96 46 77 83 58 Economics History Civics 0 52 89 65 1 61 58 2 2 87 56 74 3 89 75 45 4 77 83 53 5 89 87 52 6 58 64 61 7 56 76 87 8 75 63 89 9 83 46 77
Index(['Unnamed: 0', 'Gender', 'DOB', 'Maths', 'Physics', 'Chemistry',
'English', 'Biology', 'Economics', 'History', 'Civics'],
dtype='object')
# Distribution of Maths marks in 20-mark bins from 0 to 100.
plt.hist(x=df['Maths'], bins=list(range(0, 101, 20)))
plt.xlabel('Maths')
plt.ylabel('Frequency')
plt.title('Maths Scores')
plt.show()
# Box plot of the Maths marks; the call returns a dict of the drawn artists.
plt.boxplot(x=df['Maths'])
{'whiskers': [<matplotlib.lines.Line2D at 0x29a8cf999d0>,
<matplotlib.lines.Line2D at 0x29a8cf99430>],
'caps': [<matplotlib.lines.Line2D at 0x29a8cfc9280>,
<matplotlib.lines.Line2D at 0x29a8cfc9850>],
'boxes': [<matplotlib.lines.Line2D at 0x29a8cf99940>],
'medians': [<matplotlib.lines.Line2D at 0x29a8cfc9730>],
'fliers': [<matplotlib.lines.Line2D at 0x29a8cfc90a0>],
'means': []}
# Box plot of the Chemistry marks; the call returns a dict of the drawn artists.
plt.boxplot(x=df['Chemistry'])
{'whiskers': [<matplotlib.lines.Line2D at 0x29a8e3aad60>,
<matplotlib.lines.Line2D at 0x29a8e3aa8e0>],
'caps': [<matplotlib.lines.Line2D at 0x29a8e3aad90>,
<matplotlib.lines.Line2D at 0x29a8e3aae80>],
'boxes': [<matplotlib.lines.Line2D at 0x29a8d139c10>],
'medians': [<matplotlib.lines.Line2D at 0x29a8e3aa4f0>],
'fliers': [<matplotlib.lines.Line2D at 0x29a8e3aa6d0>],
'means': []}
# Violin plot of the Maths marks; the call returns a dict of the drawn collections.
plt.violinplot(dataset=df['Maths'])
{'bodies': [<matplotlib.collections.PolyCollection at 0x29a8ceb45b0>],
'cmaxes': <matplotlib.collections.LineCollection at 0x29a8ceb4e80>,
'cmins': <matplotlib.collections.LineCollection at 0x29a8ceb4a00>,
'cbars': <matplotlib.collections.LineCollection at 0x29a8ceb4250>}
# Violin plot of the Chemistry marks; the call returns a dict of the drawn collections.
plt.violinplot(dataset=df['Chemistry'])
{'bodies': [<matplotlib.collections.PolyCollection at 0x29a8e78d3d0>],
'cmaxes': <matplotlib.collections.LineCollection at 0x29a8e78d520>,
'cmins': <matplotlib.collections.LineCollection at 0x29a8e5d9e20>,
'cbars': <matplotlib.collections.LineCollection at 0x29a8e5d9d00>}
# Scatter plot of s (y-axis) against p (x-axis): one unconnected point per pair.
p = np.array([12, 24, 56, 90, 87, 90, 45, 34])
s = np.array([24, 56, 43, 67, 89, 90, 96, 76])
plt.scatter(x=p, y=s)
plt.xlabel('p')
plt.ylabel('s')
plt.title('scatter plot')
plt.show()
# Scatter plot of sales per fruit, with the fruit names as categories on the x-axis.
fruitname = ['Apple', 'Mango', 'Kiwi', 'Grapes', 'papaya']
sales = list(range(100, 501, 100))
plt.scatter(x=fruitname, y=sales)
plt.xlabel('fruit')
plt.ylabel('sale')
plt.title('Sales ')
plt.show()
# Scatter plot of each student's English mark against their Maths mark.
x = df['Maths']
y = df['English']
plt.scatter(x, y)
plt.xlabel('Maths')
# Axis label corrected: it was misspelled 'Engish'.
plt.ylabel('English')
plt.show()
# Load the head-size / brain-weight table, print it, then list its column labels.
df2 = pd.read_csv(filepath_or_buffer='Head_Brain_csv')
print(df2)
df2.columns
Gender AgeRange HeadSize BrainWeight 0 1 1 4512 1530 1 1 1 3738 1297 2 1 1 4261 1335 3 1 1 3777 1282 4 1 1 4177 1590 .. ... ... ... ... 232 2 2 3214 1110 233 2 2 3394 1215 234 2 2 3233 1104 235 2 2 3352 1170 236 2 2 3391 1120 [237 rows x 4 columns]
Index(['Gender', 'AgeRange', 'HeadSize', 'BrainWeight'], dtype='object')
df2['HeadSize']
0 4512
1 3738
2 4261
3 3777
4 4177
...
232 3214
233 3394
234 3233
235 3352
236 3391
Name: HeadSize, Length: 237, dtype: int64
df2['BrainWeight']
0 1530
1 1297
2 1335
3 1282
4 1590
...
232 1110
233 1215
234 1104
235 1170
236 1120
Name: BrainWeight, Length: 237, dtype: int64
# Scatter plot of BrainWeight (y-axis) against HeadSize (x-axis).
x = df2['HeadSize']
y = df2['BrainWeight']
plt.scatter(x=x, y=y)
plt.ylabel('BrainWeight')
plt.xlabel('HeadSize')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Strip plot of b against a.
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([2, 3, 4, 5, 6, 7])
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and from 0.12 the only positional argument it accepts is `data`.
sns.stripplot(x=a, y=b)
<AxesSubplot:>
# Load the head-size / brain-weight table into df4 and display it.
df4 = pd.read_csv(filepath_or_buffer='Head_Brain_csv')
df4
| Gender | AgeRange | HeadSize | BrainWeight | |
|---|---|---|---|---|
| 0 | 1 | 1 | 4512 | 1530 |
| 1 | 1 | 1 | 3738 | 1297 |
| 2 | 1 | 1 | 4261 | 1335 |
| 3 | 1 | 1 | 3777 | 1282 |
| 4 | 1 | 1 | 4177 | 1590 |
| ... | ... | ... | ... | ... |
| 232 | 2 | 2 | 3214 | 1110 |
| 233 | 2 | 2 | 3394 | 1215 |
| 234 | 2 | 2 | 3233 | 1104 |
| 235 | 2 | 2 | 3352 | 1170 |
| 236 | 2 | 2 | 3391 | 1120 |
237 rows × 4 columns
sns.violinplot?
# Horizontal violin plot of the BrainWeight distribution.
sns.violinplot(data=df4, x='BrainWeight')
<AxesSubplot:xlabel='BrainWeight'>
# Horizontal violin plot of the AgeRange distribution.
sns.violinplot(data=df4, x='AgeRange')
<AxesSubplot:xlabel='AgeRange'>
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the employee table (22 rows, including rows with missing Age, City or Salary).
df = pd.read_csv(filepath_or_buffer='eployee.csv1')
df
| SNo | Name | Age | City | Country | Salary | |
|---|---|---|---|---|---|---|
| 0 | 1 | Tom | 28.0 | Toronto | Canada | 20000.0 |
| 1 | 2 | Lee | 31.0 | Edmonto | Canada | 3900.0 |
| 2 | 3 | Dave | 34.0 | Toronto | Canada | 8000.0 |
| 3 | 4 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 4 | 5 | kamal | 30.0 | NaN | America | NaN |
| 5 | 6 | Geet | NaN | HongKong | Asia | 30007.0 |
| 6 | 7 | Steven | 43.0 | BayArea | America | 8300.0 |
| 7 | 8 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 8 | 9 | hari | 50.0 | newyork | America | 67666.0 |
| 9 | 11 | yami | NaN | newyork | America | 8888.0 |
| 10 | 12 | Anik | 26.0 | HongKong | Asia | 66672.0 |
| 11 | 13 | kamal | 30.0 | NaN | America | 34344.0 |
| 12 | 14 | Geet | 22.0 | HongKong | Asia | 30007.0 |
| 13 | 15 | Steven | 43.0 | NaN | America | 8300.0 |
| 14 | 16 | Ram | 38.0 | Hyderabad | Asia | 54666.0 |
| 15 | 17 | hari | NaN | newyork | America | 67666.0 |
| 16 | 18 | yami | 27.0 | newyork | America | 8888.0 |
| 17 | 19 | Lee | 31.0 | Edmonto | Canada | NaN |
| 18 | 20 | Andrew | 34.0 | Congo | Africa | 34432.0 |
| 19 | 21 | soham | 32.0 | Angola | Africa | 34534.0 |
| 20 | 22 | Grig | 26.0 | Finland | Europe | 33451.0 |
| 21 | 23 | Kiara | 27.0 | London | Europe | NaN |
# Horizontal violin plot of the Age distribution.
sns.violinplot(data=df, x='Age')
<AxesSubplot:xlabel='Age'>
# Horizontal violin plot of the Salary distribution.
sns.violinplot(data=df, x='Salary')
<AxesSubplot:xlabel='Salary'>
# Load the head-size / brain-weight table into df2 and display it.
df2 = pd.read_csv(filepath_or_buffer='Head_Brain_csv')
df2
| Gender | AgeRange | HeadSize | BrainWeight | |
|---|---|---|---|---|
| 0 | 1 | 1 | 4512 | 1530 |
| 1 | 1 | 1 | 3738 | 1297 |
| 2 | 1 | 1 | 4261 | 1335 |
| 3 | 1 | 1 | 3777 | 1282 |
| 4 | 1 | 1 | 4177 | 1590 |
| ... | ... | ... | ... | ... |
| 232 | 2 | 2 | 3214 | 1110 |
| 233 | 2 | 2 | 3394 | 1215 |
| 234 | 2 | 2 | 3233 | 1104 |
| 235 | 2 | 2 | 3352 | 1170 |
| 236 | 2 | 2 | 3391 | 1120 |
237 rows × 4 columns
# Horizontal violin plot of the BrainWeight distribution.
sns.violinplot(data=df2, x='BrainWeight')
<AxesSubplot:xlabel='BrainWeight'>
# Load the iris measurements (150 rows: four numeric columns plus 'class') and display them.
dfiris = pd.read_csv(filepath_or_buffer='iris.csv')
dfiris
| sepal length | sepal width | petal length | petal width | class | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
# Strip plot: one point per flower, sepal length grouped by iris class.
sns.stripplot(data=dfiris, x='class', y='sepal length')
<AxesSubplot:xlabel='class', ylabel='sepal length'>
# Strip plot: one point per flower, petal length grouped by iris class.
sns.stripplot(data=dfiris, x='class', y='petal length')
<AxesSubplot:xlabel='class', ylabel='petal length'>
# Strip plot: one point per flower, sepal width grouped by iris class.
sns.stripplot(data=dfiris, x='class', y='sepal width')
<AxesSubplot:xlabel='class', ylabel='sepal width'>
# Strip plot: one point per flower, petal width grouped by iris class.
sns.stripplot(data=dfiris, x='class', y='petal width')
<AxesSubplot:xlabel='class', ylabel='petal width'>
# Swarm plot of petal width grouped by iris class.
sns.swarmplot(data=dfiris, x='class', y='petal width')
<AxesSubplot:xlabel='class', ylabel='petal width'>
# Swarm plot of sepal length grouped by iris class.
sns.swarmplot(data=dfiris, x='class', y='sepal length')
<AxesSubplot:xlabel='class', ylabel='sepal length'>
dfiris
| sepal length | sepal width | petal length | petal width | class | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
dfiris['class'].value_counts()
Iris-setosa 50 Iris-versicolor 50 Iris-virginica 50 Name: class, dtype: int64
# Bar chart of the number of rows per iris class.
# The column is passed as x= by keyword: seaborn deprecated positional
# x in 0.11 and from 0.12 the only positional argument it accepts is `data`.
sns.countplot(x=dfiris['class'])
<AxesSubplot:xlabel='class', ylabel='count'>
sns.get_dataset_names()
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic', 'anagrams', 'anagrams', 'anscombe', 'anscombe', 'attention', 'attention', 'brain_networks', 'brain_networks', 'car_crashes', 'car_crashes', 'diamonds', 'diamonds', 'dots', 'dots', 'dowjones', 'dowjones', 'exercise', 'exercise', 'flights', 'flights', 'fmri', 'fmri', 'geyser', 'geyser', 'glue', 'glue', 'healthexp', 'healthexp', 'iris', 'iris', 'mpg', 'mpg', 'penguins', 'penguins', 'planets', 'planets', 'seaice', 'seaice', 'taxis', 'taxis', 'tips', 'tips', 'titanic', 'titanic', 'anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']
# Fetch seaborn's bundled 'titanic' example dataset (891 rows x 15 columns) and display it.
titanicds = sns.load_dataset(name='titanic')
titanicds
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows × 15 columns
titanicds.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
titanicds['class']
0 Third
1 First
2 Third
3 First
4 Third
...
886 Second
887 First
888 Third
889 First
890 Third
Name: class, Length: 891, dtype: category
Categories (3, object): ['First', 'Second', 'Third']
titanicds['class'].value_counts()
Third 491 First 216 Second 184 Name: class, dtype: int64
# Bar chart of the number of passengers per ticket class.
# The column is passed as x= by keyword: seaborn deprecated positional
# x in 0.11 and from 0.12 the only positional argument it accepts is `data`.
sns.countplot(x=titanicds['class'])
<AxesSubplot:xlabel='class', ylabel='count'>
# Bar per ticket class; bar height is the mean of `survived` (catplot's default estimator).
sns.catplot(data=titanicds, kind="bar", x="class", y="survived")
<seaborn.axisgrid.FacetGrid at 0x1c96b9256a0>
# Same survival-by-class bars, split into one bar per sex within each class.
sns.catplot(data=titanicds, kind="bar", x="class", y="survived", hue="sex")
<seaborn.axisgrid.FacetGrid at 0x1c96ca6b790>
help(sns.catplot)
Help on function catplot in module seaborn.categorical:
catplot(*, x=None, y=None, hue=None, data=None, row=None, col=None, col_wrap=None, estimator=<function mean at 0x000001C965F9F5E0>, ci=95, n_boot=1000, units=None, seed=None, order=None, hue_order=None, row_order=None, col_order=None, kind='strip', height=5, aspect=1, orient=None, color=None, palette=None, legend=True, legend_out=True, sharex=True, sharey=True, margin_titles=False, facet_kws=None, **kwargs)
Figure-level interface for drawing categorical plots onto a FacetGrid.
This function provides access to several axes-level functions that
show the relationship between a numerical and one or more categorical
variables using one of several visual representations. The ``kind``
parameter selects the underlying axes-level function to use:
Categorical scatterplots:
- :func:`stripplot` (with ``kind="strip"``; the default)
- :func:`swarmplot` (with ``kind="swarm"``)
Categorical distribution plots:
- :func:`boxplot` (with ``kind="box"``)
- :func:`violinplot` (with ``kind="violin"``)
- :func:`boxenplot` (with ``kind="boxen"``)
Categorical estimate plots:
- :func:`pointplot` (with ``kind="point"``)
- :func:`barplot` (with ``kind="bar"``)
- :func:`countplot` (with ``kind="count"``)
Extra keyword arguments are passed to the underlying function, so you
should refer to the documentation for each to see kind-specific options.
Note that unlike when using the axes-level functions directly, data must be
passed in a long-form DataFrame with variables specified by passing strings
to ``x``, ``y``, ``hue``, etc.
As in the case with the underlying plot functions, if variables have a
``categorical`` data type, the levels of the categorical variables, and
their order will be inferred from the objects. Otherwise you may have to
use alter the dataframe sorting or use the function parameters (``orient``,
``order``, ``hue_order``, etc.) to set up the plot correctly.
This function always treats one of the variables as categorical and
draws data at ordinal positions (0, 1, ... n) on the relevant axis, even
when the data has a numeric or date type.
See the :ref:`tutorial <categorical_tutorial>` for more information.
After plotting, the :class:`FacetGrid` with the plot is returned and can
be used directly to tweak supporting plot details or add other layers.
Parameters
----------
x, y, hue : names of variables in ``data``
Inputs for plotting long-form data. See examples for interpretation.
data : DataFrame
Long-form (tidy) dataset for plotting. Each column should correspond
to a variable, and each row should correspond to an observation.
row, col : names of variables in ``data``, optional
Categorical variables that will determine the faceting of the grid.
col_wrap : int
"Wrap" the column variable at this width, so that the column facets
span multiple rows. Incompatible with a ``row`` facet.
estimator : callable that maps vector -> scalar, optional
Statistical function to estimate within each categorical bin.
ci : float or "sd" or None, optional
Size of confidence intervals to draw around estimated values. If
"sd", skip bootstrapping and draw the standard deviation of the
observations. If ``None``, no bootstrapping will be performed, and
error bars will not be drawn.
n_boot : int, optional
Number of bootstrap iterations to use when computing confidence
intervals.
units : name of variable in ``data`` or vector data, optional
Identifier of sampling units, which will be used to perform a
multilevel bootstrap and account for repeated measures design.
seed : int, numpy.random.Generator, or numpy.random.RandomState, optional
Seed or random number generator for reproducible bootstrapping.
order, hue_order : lists of strings, optional
Order to plot the categorical levels in, otherwise the levels are
inferred from the data objects.
row_order, col_order : lists of strings, optional
Order to organize the rows and/or columns of the grid in, otherwise the
orders are inferred from the data objects.
kind : str, optional
The kind of plot to draw, corresponds to the name of a categorical
axes-level plotting function. Options are: "strip", "swarm", "box", "violin",
"boxen", "point", "bar", or "count".
height : scalar
Height (in inches) of each facet. See also: ``aspect``.
aspect : scalar
Aspect ratio of each facet, so that ``aspect * height`` gives the width
of each facet in inches.
orient : "v" | "h", optional
Orientation of the plot (vertical or horizontal). This is usually
inferred based on the type of the input variables, but it can be used
to resolve ambiguity when both `x` and `y` are numeric or when
plotting wide-form data.
color : matplotlib color, optional
Color for all of the elements, or seed for a gradient palette.
palette : palette name, list, or dict
Colors to use for the different levels of the ``hue`` variable. Should
be something that can be interpreted by :func:`color_palette`, or a
dictionary mapping hue levels to matplotlib colors.
legend : bool, optional
If ``True`` and there is a ``hue`` variable, draw a legend on the plot.
legend_out : bool
If ``True``, the figure size will be extended, and the legend will be
drawn outside the plot on the center right.
share{x,y} : bool, 'col', or 'row' optional
If true, the facets will share y axes across columns and/or x axes
across rows.
margin_titles : bool
If ``True``, the titles for the row variable are drawn to the right of
the last column. This option is experimental and may not work in all
cases.
facet_kws : dict, optional
Dictionary of other keyword arguments to pass to :class:`FacetGrid`.
kwargs : key, value pairings
Other keyword arguments are passed through to the underlying plotting
function.
Returns
-------
g : :class:`FacetGrid`
Returns the :class:`FacetGrid` object with the plot on it for further
tweaking.
Examples
--------
Draw a single facet to use the :class:`FacetGrid` legend placement:
.. plot::
:context: close-figs
>>> import seaborn as sns
>>> sns.set_theme(style="ticks")
>>> exercise = sns.load_dataset("exercise")
>>> g = sns.catplot(x="time", y="pulse", hue="kind", data=exercise)
Use a different plot kind to visualize the same data:
.. plot::
:context: close-figs
>>> g = sns.catplot(x="time", y="pulse", hue="kind",
... data=exercise, kind="violin")
Facet along the columns to show a third categorical variable:
.. plot::
:context: close-figs
>>> g = sns.catplot(x="time", y="pulse", hue="kind",
... col="diet", data=exercise)
Use a different height and aspect ratio for the facets:
.. plot::
:context: close-figs
>>> g = sns.catplot(x="time", y="pulse", hue="kind",
... col="diet", data=exercise,
... height=5, aspect=.8)
Make many column facets and wrap them into the rows of the grid:
.. plot::
:context: close-figs
>>> titanic = sns.load_dataset("titanic")
>>> g = sns.catplot(x="alive", col="deck", col_wrap=4,
... data=titanic[titanic.deck.notnull()],
... kind="count", height=2.5, aspect=.8)
Plot horizontally and pass other keyword arguments to the plot function:
.. plot::
:context: close-figs
>>> g = sns.catplot(x="age", y="embark_town",
... hue="sex", row="class",
... data=titanic[titanic.embark_town.notnull()],
... orient="h", height=2, aspect=3, palette="Set3",
... kind="violin", dodge=True, cut=0, bw=.2)
Use methods on the returned :class:`FacetGrid` to tweak the presentation:
.. plot::
:context: close-figs
>>> g = sns.catplot(x="who", y="survived", col="class",
... data=titanic, saturation=.5,
... kind="bar", ci=None, aspect=.6)
>>> (g.set_axis_labels("", "Survival Rate")
... .set_xticklabels(["Men", "Women", "Children"])
... .set_titles("{col_name} {col_var}")
... .set(ylim=(0, 1))
... .despine(left=True)) #doctest: +ELLIPSIS
<seaborn.axisgrid.FacetGrid object at 0x...>
sns.catplot?
sns.catplot(x="class",y="survived",hue="sex",data=titanicds,kind="violin")
<seaborn.axisgrid.FacetGrid at 0x1c96e34d4c0>
sns.catplot(x="class",y="survived",hue="sex",data=titanicds,kind="strip")
<seaborn.axisgrid.FacetGrid at 0x1c96e40d0d0>
tipsds=sns.load_dataset('tips')
tipsds
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Keep the returned Axes under its own name: assigning it to tipsds would
# replace the DataFrame that the following cells still pass as data=.
violin_ax=sns.violinplot(x='total_bill',data=tipsds)
violin_ax
<AxesSubplot:xlabel='total_bill'>
sns.violinplot(x='tip',data=tipsds)
<AxesSubplot:xlabel='tip'>
tipsds['day'].value_counts()
Sat 87 Sun 76 Thur 62 Fri 19 Name: day, dtype: int64
# Name the column with x=: newer seaborn treats the first positional
# argument as data, not as the x vector.
sns.countplot(x='day',data=tipsds)
<AxesSubplot:xlabel='day', ylabel='count'>
tipsds['smoker'].value_counts()
No 151 Yes 93 Name: smoker, dtype: int64
# Name the column with x=: newer seaborn treats the first positional
# argument as data, not as the x vector.
sns.countplot(x='smoker',data=tipsds)
<AxesSubplot:xlabel='smoker', ylabel='count'>
# Horizontal box plot of total_bill; x= is given explicitly because newer
# seaborn treats the first positional argument as data.
sns.boxplot(x='total_bill',data=tipsds)
<AxesSubplot:xlabel='total_bill'>
sns.histplot(data=tipsds,x='total_bill',bins=20)
<AxesSubplot:xlabel='total_bill', ylabel='Count'>
sns.histplot(data=tipsds,x='tip',bins=20)
<AxesSubplot:xlabel='tip', ylabel='Count'>
# histplot replaces the deprecated distplot; kde=True with stat='density'
# reproduces distplot's default density histogram plus KDE curve.
sns.histplot(data=tipsds,x='tip',bins=10,kde=True,stat='density')
<AxesSubplot:xlabel='tip', ylabel='Density'>
sns.distplot(tipsds['total_bill'],kde=False)
<AxesSubplot:xlabel='total_bill'>
# Density histogram with KDE overlay; histplot replaces the deprecated
# distplot(kde=True).
sns.histplot(data=tipsds,x='total_bill',kde=True,stat='density')
<AxesSubplot:xlabel='total_bill', ylabel='Density'>
sns.kdeplot(tipsds['total_bill'])
<AxesSubplot:xlabel='total_bill', ylabel='Density'>
sns.jointplot(data=tipsds,x='total_bill',y='tip')
<seaborn.axisgrid.JointGrid at 0x1c96fc18f40>
sns.rugplot(data=tipsds,x='tip')
<AxesSubplot:xlabel='tip'>
sns.rugplot(data=tipsds,x='total_bill',y='tip')
<AxesSubplot:xlabel='total_bill', ylabel='tip'>
tipsds
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
sns.boxplot(x='day',y='total_bill',data=tipsds)
<AxesSubplot:xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day',y='total_bill',hue='sex',data=tipsds)
<AxesSubplot:xlabel='day', ylabel='total_bill'>
sns.boxplot(x='day',y='total_bill',hue='smoker',data=tipsds)
<AxesSubplot:xlabel='day', ylabel='total_bill'>
sns.pairplot(tipsds)
<seaborn.axisgrid.PairGrid at 0x1c970137520>
sns.scatterplot(x='total_bill',y='tip',data=tipsds)
<AxesSubplot:xlabel='total_bill', ylabel='tip'>
mpgds=sns.load_dataset('mpg')
mpgds
| mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl |
| 394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup |
| 395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage |
| 396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger |
| 397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 |
398 rows × 9 columns
sns.scatterplot(data=mpgds,x='mpg',y='weight')
<AxesSubplot:xlabel='mpg', ylabel='weight'>
sns.relplot(data=mpgds,x='mpg',y='weight')
<seaborn.axisgrid.FacetGrid at 0x1c971d938b0>
sns.lineplot(data=mpgds,x='mpg',y='weight')
<AxesSubplot:xlabel='mpg', ylabel='weight'>
sns.boxplot(x='weight',data=mpgds)
<AxesSubplot:xlabel='weight'>
sns.boxplot(x='displacement',data=mpgds)
<AxesSubplot:xlabel='displacement'>
sns.rugplot(x='weight',y='mpg',data=mpgds)
<AxesSubplot:xlabel='weight', ylabel='mpg'>
sns.jointplot(data=mpgds,x='weight',y='mpg')
<seaborn.axisgrid.JointGrid at 0x1c972021580>
sns.jointplot(data=mpgds,x='weight',y='mpg',kind='kde')
<seaborn.axisgrid.JointGrid at 0x1c972115fa0>
sns.jointplot(data=mpgds,x='weight',y='mpg',kind='hex')
<seaborn.axisgrid.JointGrid at 0x1c971a7d760>
sns.stripplot(data=mpgds,x='origin',y='mpg')
<AxesSubplot:xlabel='origin', ylabel='mpg'>
sns.swarmplot(data=mpgds,x='origin',y='mpg')
<AxesSubplot:xlabel='origin', ylabel='mpg'>
sns.catplot(data=mpgds,x='origin',y='mpg',kind='box')
<seaborn.axisgrid.FacetGrid at 0x1c974308f70>
sns.get_dataset_names()
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic', 'anagrams', 'anagrams', 'anscombe', 'anscombe', 'attention', 'attention', 'brain_networks', 'brain_networks', 'car_crashes', 'car_crashes', 'diamonds', 'diamonds', 'dots', 'dots', 'dowjones', 'dowjones', 'exercise', 'exercise', 'flights', 'flights', 'fmri', 'fmri', 'geyser', 'geyser', 'glue', 'glue', 'healthexp', 'healthexp', 'iris', 'iris', 'mpg', 'mpg', 'penguins', 'penguins', 'planets', 'planets', 'seaice', 'seaice', 'taxis', 'taxis', 'tips', 'tips', 'titanic', 'titanic', 'anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']
taxi=sns.load_dataset('taxis')
taxi
| pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
| 1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
| 2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
| 3 | 2019-03-10 01:23:59 | 2019-03-10 01:49:51 | 1 | 7.70 | 27.0 | 6.15 | 0.0 | 36.95 | yellow | credit card | Hudson Sq | Yorkville West | Manhattan | Manhattan |
| 4 | 2019-03-30 13:27:42 | 2019-03-30 13:37:14 | 3 | 2.16 | 9.0 | 1.10 | 0.0 | 13.40 | yellow | credit card | Midtown East | Yorkville West | Manhattan | Manhattan |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6428 | 2019-03-31 09:51:53 | 2019-03-31 09:55:27 | 1 | 0.75 | 4.5 | 1.06 | 0.0 | 6.36 | green | credit card | East Harlem North | Central Harlem North | Manhattan | Manhattan |
| 6429 | 2019-03-31 17:38:00 | 2019-03-31 18:34:23 | 1 | 18.74 | 58.0 | 0.00 | 0.0 | 58.80 | green | credit card | Jamaica | East Concourse/Concourse Village | Queens | Bronx |
| 6430 | 2019-03-23 22:55:18 | 2019-03-23 23:14:25 | 1 | 4.14 | 16.0 | 0.00 | 0.0 | 17.30 | green | cash | Crown Heights North | Bushwick North | Brooklyn | Brooklyn |
| 6431 | 2019-03-04 10:09:25 | 2019-03-04 10:14:29 | 1 | 1.12 | 6.0 | 0.00 | 0.0 | 6.80 | green | credit card | East New York | East Flatbush/Remsen Village | Brooklyn | Brooklyn |
| 6432 | 2019-03-13 19:31:22 | 2019-03-13 19:48:02 | 1 | 3.85 | 15.0 | 3.36 | 0.0 | 20.16 | green | credit card | Boerum Hill | Windsor Terrace | Brooklyn | Brooklyn |
6433 rows × 14 columns
sns.scatterplot(x='fare',y='distance',data=taxi)
<AxesSubplot:xlabel='fare', ylabel='distance'>
taxi['color'].value_counts()
yellow 5451 green 982 Name: color, dtype: int64
sns.countplot(x='color',data=taxi)
<AxesSubplot:xlabel='color', ylabel='count'>
sns.stripplot(data=taxi,x='color',y='fare')
<AxesSubplot:xlabel='color', ylabel='fare'>
sns.boxplot(data=taxi,x='color',y='fare',hue='payment')
<AxesSubplot:xlabel='color', ylabel='fare'>
sns.boxplot(data=taxi,x='color',y='fare',hue='payment')
<AxesSubplot:xlabel='color', ylabel='fare'>
# Regression of fare on distance, one fit per payment type. x and y are
# passed by keyword because newer seaborn rejects them positionally.
sns.lmplot(x='distance',y='fare',data=taxi,hue='payment')
<seaborn.axisgrid.FacetGrid at 0x1c9743a5fd0>
# Same regression, faceted into one column per taxi color. x and y are
# passed by keyword because newer seaborn rejects them positionally.
sns.lmplot(x='distance',y='fare',data=taxi,hue='payment',col='color')
<seaborn.axisgrid.FacetGrid at 0x1c974826250>
flight=sns.load_dataset('flights')
flight
| year | month | passengers | |
|---|---|---|---|
| 0 | 1949 | Jan | 112 |
| 1 | 1949 | Feb | 118 |
| 2 | 1949 | Mar | 132 |
| 3 | 1949 | Apr | 129 |
| 4 | 1949 | May | 121 |
| ... | ... | ... | ... |
| 139 | 1960 | Aug | 606 |
| 140 | 1960 | Sep | 508 |
| 141 | 1960 | Oct | 461 |
| 142 | 1960 | Nov | 390 |
| 143 | 1960 | Dec | 432 |
144 rows × 3 columns
import seaborn as sns
# Reshape long -> wide: one row per month, one column per year, passenger
# counts as the cell values. Keyword arguments are used because newer pandas
# makes pivot's parameters keyword-only.
flight=flight.pivot(index='month',columns='year',values='passengers')
flight
| year | 1949 | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | 1958 | 1959 | 1960 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| month | ||||||||||||
| Jan | 112 | 115 | 145 | 171 | 196 | 204 | 242 | 284 | 315 | 340 | 360 | 417 |
| Feb | 118 | 126 | 150 | 180 | 196 | 188 | 233 | 277 | 301 | 318 | 342 | 391 |
| Mar | 132 | 141 | 178 | 193 | 236 | 235 | 267 | 317 | 356 | 362 | 406 | 419 |
| Apr | 129 | 135 | 163 | 181 | 235 | 227 | 269 | 313 | 348 | 348 | 396 | 461 |
| May | 121 | 125 | 172 | 183 | 229 | 234 | 270 | 318 | 355 | 363 | 420 | 472 |
| Jun | 135 | 149 | 178 | 218 | 243 | 264 | 315 | 374 | 422 | 435 | 472 | 535 |
| Jul | 148 | 170 | 199 | 230 | 264 | 302 | 364 | 413 | 465 | 491 | 548 | 622 |
| Aug | 148 | 170 | 199 | 242 | 272 | 293 | 347 | 405 | 467 | 505 | 559 | 606 |
| Sep | 136 | 158 | 184 | 209 | 237 | 259 | 312 | 355 | 404 | 404 | 463 | 508 |
| Oct | 119 | 133 | 162 | 191 | 211 | 229 | 274 | 306 | 347 | 359 | 407 | 461 |
| Nov | 104 | 114 | 146 | 172 | 180 | 203 | 237 | 271 | 305 | 310 | 362 | 390 |
| Dec | 118 | 140 | 166 | 194 | 201 | 229 | 278 | 306 | 336 | 337 | 405 | 432 |
ax=sns.heatmap(flight)
ax
<AxesSubplot:xlabel='year', ylabel='month'>
import numpy as np
import pandas as pd
Numpy Statistics
# Ten sample values used by the NumPy statistics cells that follow.
a = np.asarray(
    [24, 65, 87, 98, 90, 45, 70, 80, 64, 90]
)
a
array([24, 65, 87, 98, 90, 45, 70, 80, 64, 90])
np.mean(a)
71.3
np.median(a)
75.0
np.max(a)
98
np.min(a)
24
# Range: largest value of a minus its smallest value.
rang = a.max() - a.min()
rang
74
Descriptive statistics
#Variance
avar=np.var(a)
avar
475.81000000000006
#standard deviation
np.std(a)
21.813069476806792
a
array([24, 65, 87, 98, 90, 45, 70, 80, 64, 90])
# Percentile
np.percentile(a,60)
82.8
np.percentile(a,40)
68.0
np.percentile(a,75)
89.25
Pandas statistics
df=pd.read_csv('student_mark.csv')
df
| Unnamed: 0 | Gender | DOB | Maths | Physics | Chemistry | English | Biology | Economics | History | Civics | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | John | M | 05-04-1988 | 55 | 45 | 56 | 87 | 21 | 52 | 89 | 65 |
| 1 | Suresh | M | 04-05-1987 | 75 | 96 | 78 | 64 | 90 | 61 | 58 | 2 |
| 2 | Ramesh | M | 25-05-1989 | 25 | 54 | 89 | 76 | 95 | 87 | 56 | 74 |
| 3 | Jessica | F | 12-08-1990 | 78 | 96 | 86 | 63 | 54 | 89 | 75 | 45 |
| 4 | Jennifer | F | 02-09-1989 | 58 | 96 | 78 | 46 | 96 | 77 | 83 | 53 |
| 5 | Annu | F | 05-04-1988 | 45 | 87 | 52 | 89 | 55 | 89 | 87 | 52 |
| 6 | pooja | F | 04-05-1987 | 55 | 64 | 61 | 58 | 75 | 58 | 64 | 61 |
| 7 | Ritesh | M | 25-05-1989 | 54 | 76 | 87 | 56 | 25 | 56 | 76 | 87 |
| 8 | Farha | F | 12-08-1990 | 55 | 63 | 89 | 75 | 78 | 75 | 63 | 89 |
| 9 | Mukesh | M | 02-09-1989 | 96 | 46 | 77 | 83 | 58 | 83 | 46 | 77 |
df['Maths']
0 55 1 75 2 25 3 78 4 58 5 45 6 55 7 54 8 55 9 96 Name: Maths, dtype: int64
df['Maths'].mean()
59.6
df['Maths'].median()
55.0
df['Chemistry'].mean()
75.3
df['Chemistry'].median()
78.0
df['Chemistry'].mode()
0 78 1 89 Name: Chemistry, dtype: int64
df['Maths'].max()
96
df['Maths'].min()
25
rang=df['Maths'].max()-df['Maths'].min()
rang
71
mathv=df['Maths'].var()
mathv
378.71111111111105
e=df['Maths'].std()
e
19.46050130677807
import scipy as st
import warnings
warnings.filterwarnings('ignore')
a
array([24, 65, 87, 98, 90, 45, 70, 80, 64, 90])
# Call NumPy directly: scipy's top-level mean is only a deprecated alias of
# numpy.mean, and the deprecation warning is hidden by filterwarnings above.
np.mean(a)
71.3
# Call NumPy directly: scipy's top-level median is only a deprecated alias
# of numpy.median.
np.median(a)
75.0
# Variance of a via NumPy (default ddof=0); scipy's top-level var is only a
# deprecated alias of numpy.var.
d=np.var(a)
d
475.81000000000006
round(d,2)
475.81
round(234567.0987654,3)
234567.099
import scipy.stats as st
b=np.array([4,4,5,46,4,3,24,67,543,43,234,54,32,23,5,6,7,7,8,88,8,8,8,8,8,8,8,8,8,8,8,7,65,4,3,2,2,])
b
array([ 4, 4, 5, 46, 4, 3, 24, 67, 543, 43, 234, 54, 32,
23, 5, 6, 7, 7, 8, 88, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 7, 65, 4, 3, 2, 2])
st.mode(b)
ModeResult(mode=array([8]), count=array([12]))
df
| Unnamed: 0 | Gender | DOB | Maths | Physics | Chemistry | English | Biology | Economics | History | Civics | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | John | M | 05-04-1988 | 55 | 45 | 56 | 87 | 21 | 52 | 89 | 65 |
| 1 | Suresh | M | 04-05-1987 | 75 | 96 | 78 | 64 | 90 | 61 | 58 | 2 |
| 2 | Ramesh | M | 25-05-1989 | 25 | 54 | 89 | 76 | 95 | 87 | 56 | 74 |
| 3 | Jessica | F | 12-08-1990 | 78 | 96 | 86 | 63 | 54 | 89 | 75 | 45 |
| 4 | Jennifer | F | 02-09-1989 | 58 | 96 | 78 | 46 | 96 | 77 | 83 | 53 |
| 5 | Annu | F | 05-04-1988 | 45 | 87 | 52 | 89 | 55 | 89 | 87 | 52 |
| 6 | pooja | F | 04-05-1987 | 55 | 64 | 61 | 58 | 75 | 58 | 64 | 61 |
| 7 | Ritesh | M | 25-05-1989 | 54 | 76 | 87 | 56 | 25 | 56 | 76 | 87 |
| 8 | Farha | F | 12-08-1990 | 55 | 63 | 89 | 75 | 78 | 75 | 63 | 89 |
| 9 | Mukesh | M | 02-09-1989 | 96 | 46 | 77 | 83 | 58 | 83 | 46 | 77 |
df.describe()
| Maths | Physics | Chemistry | English | Biology | Economics | History | Civics | |
|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.0000 |
| mean | 59.600000 | 72.300000 | 75.300000 | 69.700000 | 64.700000 | 72.700000 | 69.700000 | 60.5000 |
| std | 19.460501 | 20.661559 | 14.000397 | 14.453373 | 26.998148 | 14.629119 | 14.453373 | 25.3432 |
| min | 25.000000 | 45.000000 | 52.000000 | 46.000000 | 21.000000 | 52.000000 | 46.000000 | 2.0000 |
| 25% | 54.250000 | 56.250000 | 65.000000 | 59.250000 | 54.250000 | 58.750000 | 59.250000 | 52.2500 |
| 50% | 55.000000 | 70.000000 | 78.000000 | 69.500000 | 66.500000 | 76.000000 | 69.500000 | 63.0000 |
| 75% | 70.750000 | 93.750000 | 86.750000 | 81.250000 | 87.000000 | 86.000000 | 81.250000 | 76.2500 |
| max | 96.000000 | 96.000000 | 89.000000 | 89.000000 | 96.000000 | 89.000000 | 89.000000 | 89.0000 |
import scipy
from scipy import stats
from scipy.stats import percentileofscore
percentileofscore(df['Maths'],25)
10.0
percentileofscore(df['Maths'],60)
70.0
percentileofscore(df['English'],75)
60.0
Pandas Percentile
q1=df['Maths'].quantile(0.25)#25
q1
54.25
#variance
# .var() returns the variance; .std() (used in the next cell) is its square root.
df['Maths'].var()
378.71111111111105
#standard deviation
df['Maths'].std()
19.46050130677807
print(df['English'].var())
print(df['English'].std())
208.89999999999998 14.453373308677804
#to call out outliers or abnormal data
from scipy.stats import zscore
zscore([43,45,67,87,87,28,90,56,])
array([-0.90075228, -0.81011054, 0.18694859, 1.09336597, 1.09336597,
-1.58056532, 1.22932858, -0.31158098])
import pandas as pd
from scipy import stats
data=pd.read_csv('brain_size.csv',sep=';',na_values=".")
data
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118.0 | 64.5 | 816932 |
| 1 | 2 | Male | 140 | 150 | 124 | NaN | 72.5 | 1001121 |
| 2 | 3 | Male | 139 | 123 | 150 | 143.0 | 73.3 | 1038437 |
| 3 | 4 | Male | 133 | 129 | 128 | 172.0 | 68.8 | 965353 |
| 4 | 5 | Female | 137 | 132 | 134 | 147.0 | 65.0 | 951545 |
| 5 | 6 | Female | 99 | 90 | 110 | 146.0 | 69.0 | 928799 |
| 6 | 7 | Female | 138 | 136 | 131 | 138.0 | 64.5 | 991305 |
| 7 | 8 | Female | 92 | 90 | 98 | 175.0 | 66.0 | 854258 |
| 8 | 9 | Male | 89 | 93 | 84 | 134.0 | 66.3 | 904858 |
| 9 | 10 | Male | 133 | 114 | 147 | 172.0 | 68.8 | 955466 |
| 10 | 11 | Female | 132 | 129 | 124 | 118.0 | 64.5 | 833868 |
| 11 | 12 | Male | 141 | 150 | 128 | 151.0 | 70.0 | 1079549 |
| 12 | 13 | Male | 135 | 129 | 124 | 155.0 | 69.0 | 924059 |
| 13 | 14 | Female | 140 | 120 | 147 | 155.0 | 70.5 | 856472 |
| 14 | 15 | Female | 96 | 100 | 90 | 146.0 | 66.0 | 878897 |
| 15 | 16 | Female | 83 | 71 | 96 | 135.0 | 68.0 | 865363 |
| 16 | 17 | Female | 132 | 132 | 120 | 127.0 | 68.5 | 852244 |
| 17 | 18 | Male | 100 | 96 | 102 | 178.0 | 73.5 | 945088 |
| 18 | 19 | Female | 101 | 112 | 84 | 136.0 | 66.3 | 808020 |
| 19 | 20 | Male | 80 | 77 | 86 | 180.0 | 70.0 | 889083 |
| 20 | 21 | Male | 83 | 83 | 86 | NaN | NaN | 892420 |
| 21 | 22 | Male | 97 | 107 | 84 | 186.0 | 76.5 | 905940 |
| 22 | 23 | Female | 135 | 129 | 134 | 122.0 | 62.0 | 790619 |
| 23 | 24 | Male | 139 | 145 | 128 | 132.0 | 68.0 | 955003 |
| 24 | 25 | Female | 91 | 86 | 102 | 114.0 | 63.0 | 831772 |
| 25 | 26 | Male | 141 | 145 | 131 | 171.0 | 72.0 | 935494 |
| 26 | 27 | Female | 85 | 90 | 84 | 140.0 | 68.0 | 798612 |
| 27 | 28 | Male | 103 | 96 | 110 | 187.0 | 77.0 | 1062462 |
| 28 | 29 | Female | 77 | 83 | 72 | 106.0 | 63.0 | 793549 |
| 29 | 30 | Female | 130 | 126 | 124 | 159.0 | 66.5 | 866662 |
| 30 | 31 | Female | 133 | 126 | 132 | 127.0 | 62.5 | 857782 |
| 31 | 32 | Male | 144 | 145 | 137 | 191.0 | 67.0 | 949589 |
| 32 | 33 | Male | 103 | 96 | 110 | 192.0 | 75.5 | 997925 |
| 33 | 34 | Male | 90 | 96 | 86 | 181.0 | 69.0 | 879987 |
| 34 | 35 | Female | 83 | 90 | 81 | 143.0 | 66.5 | 834344 |
| 35 | 36 | Female | 133 | 129 | 128 | 153.0 | 66.5 | 948066 |
| 36 | 37 | Male | 140 | 150 | 124 | 144.0 | 70.5 | 949395 |
| 37 | 38 | Female | 88 | 86 | 94 | 139.0 | 64.5 | 893983 |
| 38 | 39 | Male | 81 | 90 | 74 | 148.0 | 74.0 | 930016 |
| 39 | 40 | Male | 89 | 91 | 89 | 179.0 | 75.5 | 935863 |
T-test
data['VIQ'].mean()
112.35
data[data['Gender']=='Female']['VIQ'].mean()
109.45
stats.ttest_1samp(data['PIQ'],106)
Ttest_1sampResult(statistic=1.4143037747013467, pvalue=0.16520690047123543)
stats.ttest_1samp(data['PIQ'],108)
Ttest_1sampResult(statistic=0.8513967996958363, pvalue=0.39974665151472855)
# Split the VIQ column by the Gender label, then show the overall FSIQ mean.
is_female = data['Gender'] == 'Female'
is_male = data['Gender'] == 'Male'
female_viq = data.loc[is_female, 'VIQ']
male_viq = data.loc[is_male, 'VIQ']
data['FSIQ'].mean()
113.45
data['PIQ'].mean()
111.025
# Independent two-sample t-test comparing the FSIQ and PIQ columns.
# NOTE(review): both columns are taken from the same rows of data, so a
# paired test (stats.ttest_rel) may be the better fit — confirm.
stats.ttest_ind(data['FSIQ'],data['PIQ'])
Ttest_indResult(statistic=0.465637596380964, pvalue=0.6427725009414841)
CORRELATION COEFFICIENT
df=pd.read_csv('student_mark.csv')
df
| Unnamed: 0 | Gender | DOB | Maths | Physics | Chemistry | English | Biology | Economics | History | Civics | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | John | M | 05-04-1988 | 55 | 45 | 56 | 87 | 21 | 52 | 89 | 65 |
| 1 | Suresh | M | 04-05-1987 | 75 | 96 | 78 | 64 | 90 | 61 | 58 | 2 |
| 2 | Ramesh | M | 25-05-1989 | 25 | 54 | 89 | 76 | 95 | 87 | 56 | 74 |
| 3 | Jessica | F | 12-08-1990 | 78 | 96 | 86 | 63 | 54 | 89 | 75 | 45 |
| 4 | Jennifer | F | 02-09-1989 | 58 | 96 | 78 | 46 | 96 | 77 | 83 | 53 |
| 5 | Annu | F | 05-04-1988 | 45 | 87 | 52 | 89 | 55 | 89 | 87 | 52 |
| 6 | pooja | F | 04-05-1987 | 55 | 64 | 61 | 58 | 75 | 58 | 64 | 61 |
| 7 | Ritesh | M | 25-05-1989 | 54 | 76 | 87 | 56 | 25 | 56 | 76 | 87 |
| 8 | Farha | F | 12-08-1990 | 55 | 63 | 89 | 75 | 78 | 75 | 63 | 89 |
| 9 | Mukesh | M | 02-09-1989 | 96 | 46 | 77 | 83 | 58 | 83 | 46 | 77 |
df.iloc[:,3:-1]
| Maths | Physics | Chemistry | English | Biology | Economics | History | |
|---|---|---|---|---|---|---|---|
| 0 | 55 | 45 | 56 | 87 | 21 | 52 | 89 |
| 1 | 75 | 96 | 78 | 64 | 90 | 61 | 58 |
| 2 | 25 | 54 | 89 | 76 | 95 | 87 | 56 |
| 3 | 78 | 96 | 86 | 63 | 54 | 89 | 75 |
| 4 | 58 | 96 | 78 | 46 | 96 | 77 | 83 |
| 5 | 45 | 87 | 52 | 89 | 55 | 89 | 87 |
| 6 | 55 | 64 | 61 | 58 | 75 | 58 | 64 |
| 7 | 54 | 76 | 87 | 56 | 25 | 56 | 76 |
| 8 | 55 | 63 | 89 | 75 | 78 | 75 | 63 |
| 9 | 96 | 46 | 77 | 83 | 58 | 83 | 46 |
#correlation
# Select the numeric columns explicitly before correlating: the frame also
# holds string columns (name, Gender, DOB), which newer pandas refuses to
# drop silently inside corr().
dfc=df.select_dtypes(include='number').corr()
dfc
| Maths | Physics | Chemistry | English | Biology | Economics | History | Civics | |
|---|---|---|---|---|---|---|---|---|
| Maths | 1.000000 | 0.113354 | 0.076751 | -0.064074 | -0.146598 | 0.000312 | -0.320846 | -0.274629 |
| Physics | 0.113354 | 1.000000 | 0.117192 | -0.562608 | 0.279638 | 0.191851 | 0.334082 | -0.643266 |
| Chemistry | 0.076751 | 0.117192 | 1.000000 | -0.409680 | 0.318032 | 0.235390 | -0.482710 | 0.210908 |
| English | -0.064074 | -0.562608 | -0.409680 | 1.000000 | -0.330844 | 0.237577 | -0.038775 | 0.186705 |
| Biology | -0.146598 | 0.279638 | 0.318032 | -0.330844 | 1.000000 | 0.370531 | -0.456131 | -0.330060 |
| Economics | 0.000312 | 0.191851 | 0.235390 | 0.237577 | 0.370531 | 1.000000 | -0.164953 | 0.043905 |
| History | -0.320846 | 0.334082 | -0.482710 | -0.038775 | -0.456131 | -0.164953 | 1.000000 | -0.048686 |
| Civics | -0.274629 | -0.643266 | 0.210908 | 0.186705 | -0.330060 | 0.043905 | -0.048686 | 1.000000 |
import seaborn as sns
sns.heatmap(dfc)
<AxesSubplot:>
sns.heatmap(dfc,cmap='Blues',annot=True)
<AxesSubplot:>
CHI-SQUARE TEST -- Test of independence between categorical variables -- Applies to categorical (string) data
import numpy as np
import pandas as pd
import scipy.stats as stats
#ds=pd.read_csv('')
ANOVA TEST -- Analysis of Variance
from scipy.stats import f_oneway
# Five marks for each of the three sections being compared.
marks_sectionA, marks_sectionB, marks_sectionc = (
    [70, 80, 85, 70, 90],
    [90, 70, 40, 30, 15],
    [50, 89, 36, 70, 49],
)
# One-way ANOVA across the three sections.
f_oneway(marks_sectionA, marks_sectionB, marks_sectionc)
F_onewayResult(statistic=2.4330768697580933, pvalue=0.1297152831419187)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
df=pd.read_csv('president._heights.csv')
df
| order | name | height | |
|---|---|---|---|
| 0 | 1 | George Washington | 189 |
| 1 | 2 | John Adams | 170 |
| 2 | 3 | Thomas Jefferson | 189 |
| 3 | 4 | James Madison | 163 |
| 4 | 5 | James Monroe | 183 |
| 5 | 6 | John Quincy Adams | 171 |
| 6 | 7 | Andrew Jackson | 185 |
| 7 | 8 | Martin Van Buren | 168 |
| 8 | 9 | William Henry Harrison | 173 |
| 9 | 10 | John Tyler | 183 |
| 10 | 11 | James K. Polk | 173 |
| 11 | 12 | Zachary Taylor | 173 |
| 12 | 13 | Millard Fillmore | 175 |
| 13 | 14 | Franklin Pierce | 178 |
| 14 | 15 | James Buchanan | 183 |
| 15 | 16 | Abraham Lincoln | 193 |
| 16 | 17 | Andrew Johnson | 178 |
| 17 | 18 | Ulysses S. Grant | 173 |
| 18 | 19 | Rutherford B. Hayes | 174 |
| 19 | 20 | James A. Garfield | 183 |
| 20 | 21 | Chester A. Arthur | 183 |
| 21 | 23 | Benjamin Harrison | 168 |
| 22 | 25 | William McKinley | 170 |
| 23 | 26 | Theodore Roosevelt | 178 |
| 24 | 27 | William Howard Taft | 182 |
| 25 | 28 | Woodrow Wilson | 180 |
| 26 | 29 | Warren G. Harding | 183 |
| 27 | 30 | Calvin Coolidge | 178 |
| 28 | 31 | Herbert Hoover | 182 |
| 29 | 32 | Franklin D. Roosevelt | 188 |
| 30 | 33 | Harry S. Truman | 175 |
| 31 | 34 | Dwight D. Eisenhower | 179 |
| 32 | 35 | John F. Kennedy | 183 |
| 33 | 36 | Lyndon B. Johnson | 193 |
| 34 | 37 | Richard Nixon | 182 |
| 35 | 38 | Gerald Ford | 183 |
| 36 | 39 | Jimmy Carter | 177 |
| 37 | 40 | Ronald Reagan | 185 |
| 38 | 41 | George H. W. Bush | 188 |
| 39 | 42 | Bill Clinton | 188 |
| 40 | 43 | George W. Bush | 182 |
| 41 | 44 | Barack Obama | 185 |
df.columns
Index(['order', 'name', 'height'], dtype='object')
plt.boxplot(df.height)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c969009c40>,
<matplotlib.lines.Line2D at 0x1c969009f10>],
'caps': [<matplotlib.lines.Line2D at 0x1c969027220>,
<matplotlib.lines.Line2D at 0x1c9690274f0>],
'boxes': [<matplotlib.lines.Line2D at 0x1c969009970>],
'medians': [<matplotlib.lines.Line2D at 0x1c9690277c0>],
'fliers': [<matplotlib.lines.Line2D at 0x1c969027a90>],
'means': []}
# no outliers
df['height'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
0.0 163.0 0.1 170.1 0.2 173.0 0.3 175.6 0.4 178.0 0.5 182.0 0.6 183.0 0.7 183.0 0.8 185.0 0.9 188.0 1.0 193.0 Name: height, dtype: float64
df2=pd.read_csv('HR_attriition_rate.csv')
df2.head()
| satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Departments | salary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
df2.columns
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
'promotion_last_5years', 'Departments ', 'salary'],
dtype='object')
plt.boxplot(df2.satisfaction_level)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c969258d90>,
<matplotlib.lines.Line2D at 0x1c9690680a0>],
'caps': [<matplotlib.lines.Line2D at 0x1c969068370>,
<matplotlib.lines.Line2D at 0x1c969068670>],
'boxes': [<matplotlib.lines.Line2D at 0x1c969258a90>],
'medians': [<matplotlib.lines.Line2D at 0x1c969068940>],
'fliers': [<matplotlib.lines.Line2D at 0x1c969068c10>],
'means': []}
df2['satisfaction_level'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
0.0 0.09 0.1 0.21 0.2 0.40 0.3 0.49 0.4 0.57 0.5 0.64 0.6 0.72 0.7 0.78 0.8 0.85 0.9 0.92 1.0 1.00 Name: satisfaction_level, dtype: float64
df3=pd.read_csv('brain_size.csv',sep=';')
df3.head()
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118 | 64.5 | 816932 |
| 1 | 2 | Male | 140 | 150 | 124 | . | 72.5 | 1001121 |
| 2 | 3 | Male | 139 | 123 | 150 | 143 | 73.3 | 1038437 |
| 3 | 4 | Male | 133 | 129 | 128 | 172 | 68.8 | 965353 |
| 4 | 5 | Female | 137 | 132 | 134 | 147 | 65.0 | 951545 |
df3.columns
Index(['id', 'Gender', 'FSIQ', 'VIQ', 'PIQ', 'Weight', 'Height', 'MRI_Count'], dtype='object')
plt.boxplot(df3.MRI_Count)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c9690d0070>,
<matplotlib.lines.Line2D at 0x1c9690d02e0>],
'caps': [<matplotlib.lines.Line2D at 0x1c9690d05b0>,
<matplotlib.lines.Line2D at 0x1c9690d0880>],
'boxes': [<matplotlib.lines.Line2D at 0x1c9690c1d60>],
'medians': [<matplotlib.lines.Line2D at 0x1c9690d0b50>],
'fliers': [<matplotlib.lines.Line2D at 0x1c9690d0e20>],
'means': []}
plt.boxplot(df3.FSIQ)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c9691383d0>,
<matplotlib.lines.Line2D at 0x1c9691386a0>],
'caps': [<matplotlib.lines.Line2D at 0x1c969138970>,
<matplotlib.lines.Line2D at 0x1c969138c40>],
'boxes': [<matplotlib.lines.Line2D at 0x1c9691380d0>],
'medians': [<matplotlib.lines.Line2D at 0x1c969138f10>],
'fliers': [<matplotlib.lines.Line2D at 0x1c969145220>],
'means': []}
tf=pd.read_csv('titanic')
tf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
plt.boxplot(tf.Fare)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c975ddb5e0>,
<matplotlib.lines.Line2D at 0x1c975ddb8b0>],
'caps': [<matplotlib.lines.Line2D at 0x1c975ddbb80>,
<matplotlib.lines.Line2D at 0x1c975ddbe50>],
'boxes': [<matplotlib.lines.Line2D at 0x1c975ddb310>],
'medians': [<matplotlib.lines.Line2D at 0x1c975deb160>],
'fliers': [<matplotlib.lines.Line2D at 0x1c975deb430>],
'means': []}
hf=pd.read_csv('House_price_data')
hf.head()
| LotArea | OverallQual | OverallCond | TotalBsmtSF | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageArea | AboveMedianPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8450 | 7 | 5 | 856 | 2 | 1 | 3 | 8 | 0 | 548 | 1 |
| 1 | 9600 | 6 | 8 | 1262 | 2 | 0 | 3 | 6 | 1 | 460 | 1 |
| 2 | 11250 | 7 | 5 | 920 | 2 | 1 | 3 | 6 | 1 | 608 | 1 |
| 3 | 9550 | 7 | 5 | 756 | 1 | 0 | 3 | 7 | 1 | 642 | 0 |
| 4 | 14260 | 8 | 5 | 1145 | 2 | 1 | 4 | 9 | 1 | 836 | 1 |
# List the columns of the house-price frame loaded above as hf.
hf.columns
Index(['LotArea', 'OverallQual', 'OverallCond', 'TotalBsmtSF', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea',
'AboveMedianPrice'],
dtype='object')
plt.boxplot(hf.LotArea)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c975e3d9d0>,
<matplotlib.lines.Line2D at 0x1c975e3dca0>],
'caps': [<matplotlib.lines.Line2D at 0x1c975e3df70>,
<matplotlib.lines.Line2D at 0x1c975e4d2b0>],
'boxes': [<matplotlib.lines.Line2D at 0x1c975e3d700>],
'medians': [<matplotlib.lines.Line2D at 0x1c975e4d580>],
'fliers': [<matplotlib.lines.Line2D at 0x1c975e4d850>],
'means': []}
hf['LotArea'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
0.0 1300.0 0.1 5000.0 0.2 7078.4 0.3 8063.7 0.4 8793.4 0.5 9478.5 0.6 10198.2 0.7 11066.5 0.8 12205.8 0.9 14381.7 1.0 215245.0 Name: LotArea, dtype: float64
plt.boxplot(hf.TotalBsmtSF)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c975ea06a0>,
<matplotlib.lines.Line2D at 0x1c975ea0970>],
'caps': [<matplotlib.lines.Line2D at 0x1c975ea0c40>,
<matplotlib.lines.Line2D at 0x1c975ea0f10>],
'boxes': [<matplotlib.lines.Line2D at 0x1c975ea03d0>],
'medians': [<matplotlib.lines.Line2D at 0x1c975ead220>],
'fliers': [<matplotlib.lines.Line2D at 0x1c975ead4f0>],
'means': []}
import seaborn as sns
sns.boxplot(hf.TotalBsmtSF)
<AxesSubplot:xlabel='TotalBsmtSF'>
hf['TotalBsmtSF'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
0.0 0.0 0.1 636.9 0.2 755.8 0.3 840.0 0.4 910.0 0.5 991.5 0.6 1088.0 0.7 1216.0 0.8 1391.2 0.9 1602.2 1.0 6110.0 Name: TotalBsmtSF, dtype: float64
plt.boxplot(hf.GarageArea)
{'whiskers': [<matplotlib.lines.Line2D at 0x1c9761cc700>,
<matplotlib.lines.Line2D at 0x1c9761cc9d0>],
'caps': [<matplotlib.lines.Line2D at 0x1c9761ccca0>,
<matplotlib.lines.Line2D at 0x1c9761ccf70>],
'boxes': [<matplotlib.lines.Line2D at 0x1c9761cc400>],
'medians': [<matplotlib.lines.Line2D at 0x1c9761d9280>],
'fliers': [<matplotlib.lines.Line2D at 0x1c9761d9550>],
'means': []}
sns.boxplot(hf.GarageArea)
<AxesSubplot:xlabel='GarageArea'>
hf['GarageArea'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
0.0 0.0 0.1 240.0 0.2 295.6 0.3 384.0 0.4 440.0 0.5 480.0 0.6 516.0 0.7 560.0 0.8 620.2 0.9 757.1 1.0 1418.0 Name: GarageArea, dtype: float64
plt.scatter(hf.LotArea,hf.GarageArea)
plt.xlabel('LotArea')
plt.ylabel('GarageArea')
plt.show()
# Total rooms above ground (x) against lot area (y).
# The axis labels name the columns that are actually plotted.
plt.scatter(hf.TotRmsAbvGrd,hf.LotArea)
plt.xlabel('TotRmsAbvGrd')
plt.ylabel('LotArea')
plt.show()
Z-Score
Wikipedia definition: the Z-score is the signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured.
from scipy .stats import zscore
import numpy as np
# Absolute z-score of every value in hf; values above 3 are treated as outliers further down
z=np.abs(zscore(hf))
z
| LotArea | OverallQual | OverallCond | TotalBsmtSF | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageArea | AboveMedianPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.207142 | 0.651479 | 0.517200 | 0.459303 | 0.789741 | 1.227585 | 0.163779 | 0.912210 | 0.951226 | 0.351000 | 1.002743 |
| 1 | 0.091886 | 0.071836 | 2.179628 | 0.466465 | 0.789741 | 0.761621 | 0.163779 | 0.318683 | 0.600495 | 0.060731 | 1.002743 |
| 2 | 0.073480 | 0.651479 | 0.517200 | 0.313369 | 0.789741 | 1.227585 | 0.163779 | 0.318683 | 0.600495 | 0.631726 | 1.002743 |
| 3 | 0.096897 | 0.651479 | 0.517200 | 0.687324 | 1.026041 | 0.761621 | 0.163779 | 0.296763 | 0.600495 | 0.790804 | 0.997264 |
| 4 | 0.375148 | 1.374795 | 0.517200 | 0.199680 | 0.789741 | 1.227585 | 1.390023 | 1.527656 | 0.600495 | 1.698485 | 1.002743 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 0.260560 | 0.071836 | 0.517200 | 0.238122 | 0.789741 | 1.227585 | 0.163779 | 0.296763 | 0.600495 | 0.060731 | 1.002743 |
| 1456 | 0.266407 | 0.071836 | 0.381743 | 1.104925 | 0.789741 | 0.761621 | 0.163779 | 0.296763 | 2.152216 | 0.126420 | 1.002743 |
| 1457 | 0.147810 | 0.651479 | 3.078570 | 0.215641 | 0.789741 | 0.761621 | 1.390023 | 1.527656 | 2.152216 | 1.033914 | 1.002743 |
| 1458 | 0.080160 | 0.795151 | 0.381743 | 0.046905 | 1.026041 | 0.761621 | 1.062465 | 0.934130 | 0.951226 | 1.090059 | 0.997264 |
| 1459 | 0.058112 | 0.795151 | 0.381743 | 0.452784 | 1.026041 | 1.227585 | 0.163779 | 0.318683 | 0.951226 | 0.921624 | 0.997264 |
1460 rows × 11 columns
Looking at the output above, it is difficult to say which data points are outliers. Let's define a threshold to identify them.
# An absolute z-score above this many standard deviations marks a value as an outlier.
threshold=3
# np.where returns two arrays: the row positions and the column positions
# of every cell of z that exceeds the threshold.
print(np.where(z>threshold))
(array([ 53, 53, 88, 144, 166, 178, 185, 185, 188, 189, 191,
218, 224, 241, 249, 250, 291, 298, 304, 309, 313, 330,
332, 335, 375, 375, 378, 384, 398, 440, 451, 457, 461,
496, 508, 519, 523, 533, 570, 581, 583, 597, 605, 624,
628, 634, 635, 635, 642, 661, 664, 676, 691, 703, 706,
726, 745, 769, 769, 803, 825, 843, 848, 897, 897, 910,
921, 980, 991, 1031, 1044, 1061, 1123, 1149, 1154, 1163, 1163,
1173, 1182, 1190, 1213, 1213, 1230, 1230, 1268, 1270, 1283, 1298,
1298, 1298, 1298, 1298, 1327, 1350, 1350, 1350, 1352, 1373, 1386,
1396, 1435, 1450, 1457], dtype=int64), array([0, 6, 2, 6, 8, 9, 2, 7, 5, 6, 2, 2, 3, 2, 0, 2, 6, 5, 2, 8, 0, 6,
3, 0, 1, 2, 2, 0, 2, 3, 0, 0, 2, 3, 2, 2, 3, 1, 6, 9, 2, 5, 8, 5,
5, 6, 6, 7, 8, 0, 9, 2, 3, 2, 0, 2, 2, 0, 7, 7, 9, 6, 0, 6, 7, 7,
5, 2, 2, 7, 3, 9, 2, 2, 5, 5, 6, 7, 3, 9, 2, 6, 5, 7, 2, 6, 5, 0,
3, 7, 8, 9, 2, 5, 6, 7, 2, 3, 7, 0, 2, 5, 2], dtype=int64))
# The first array holds the row positions and the second array the column
# positions of every cell whose absolute z-score is greater than 3.
# Look up a single cell with z.iloc[row, column].
z.iloc[53,0]
3.98424384357879
z.iloc[53,6]
3.514951815977729
z.iloc[88,2]
3.2140273717438843
z.iloc[144,6]
3.842510049724758
Q1=hf.quantile(0.25)
Q3=hf.quantile(0.75)
Q1
LotArea 7553.50 OverallQual 5.00 OverallCond 5.00 TotalBsmtSF 795.75 FullBath 1.00 HalfBath 0.00 BedroomAbvGr 2.00 TotRmsAbvGrd 5.00 Fireplaces 0.00 GarageArea 334.50 AboveMedianPrice 0.00 Name: 0.25, dtype: float64
Q3
LotArea 11601.50 OverallQual 7.00 OverallCond 6.00 TotalBsmtSF 1298.25 FullBath 2.00 HalfBath 1.00 BedroomAbvGr 3.00 TotRmsAbvGrd 7.00 Fireplaces 1.00 GarageArea 576.00 AboveMedianPrice 1.00 Name: 0.75, dtype: float64
IQR=Q3-Q1
IQR
LotArea 4048.0 OverallQual 2.0 OverallCond 1.0 TotalBsmtSF 502.5 FullBath 1.0 HalfBath 1.0 BedroomAbvGr 1.0 TotRmsAbvGrd 2.0 Fireplaces 1.0 GarageArea 241.5 AboveMedianPrice 1.0 dtype: float64
In the previous section we saw how to detect outliers using the Z-score. Now we are going to remove (filter out) the outliers to get clean data. This can be done with just one line of code, as we have already calculated the Z-scores.
hf_new=hf[(z<3).all(axis=1)]
hf_new
| LotArea | OverallQual | OverallCond | TotalBsmtSF | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageArea | AboveMedianPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8450 | 7 | 5 | 856 | 2 | 1 | 3 | 8 | 0 | 548 | 1 |
| 1 | 9600 | 6 | 8 | 1262 | 2 | 0 | 3 | 6 | 1 | 460 | 1 |
| 2 | 11250 | 7 | 5 | 920 | 2 | 1 | 3 | 6 | 1 | 608 | 1 |
| 3 | 9550 | 7 | 5 | 756 | 1 | 0 | 3 | 7 | 1 | 642 | 0 |
| 4 | 14260 | 8 | 5 | 1145 | 2 | 1 | 4 | 9 | 1 | 836 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1454 | 7500 | 7 | 5 | 1221 | 2 | 0 | 2 | 6 | 0 | 400 | 1 |
| 1455 | 7917 | 6 | 5 | 953 | 2 | 1 | 3 | 7 | 1 | 460 | 1 |
| 1456 | 13175 | 6 | 6 | 1542 | 2 | 0 | 3 | 7 | 2 | 500 | 1 |
| 1458 | 9717 | 5 | 6 | 1078 | 1 | 0 | 2 | 5 | 0 | 240 | 0 |
| 1459 | 9937 | 5 | 6 | 1256 | 1 | 1 | 3 | 6 | 0 | 276 | 0 |
1372 rows × 11 columns
hf.shape
(1460, 11)
hf_new.shape
(1372, 11)
So hf_new above shows that the rows containing outliers have been removed (1460 rows reduced to 1372).
import warnings

# pyplot (not the top-level matplotlib package) provides scatter/show/figure(figsize=...),
# which the cells below call through the name plt.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress all warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')
wine=pd.read_csv('winequality-red.csv')
wine.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
wine.shape
(1599, 12)
wine.dtypes
fixed acidity float64 volatile acidity float64 citric acid float64 residual sugar float64 chlorides float64 free sulfur dioxide float64 total sulfur dioxide float64 density float64 pH float64 sulphates float64 alcohol float64 quality int64 dtype: object
wine.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
wine.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
wine.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
key observations
Exploring data variable
wine.quality.unique()
array([5, 6, 7, 4, 8, 3], dtype=int64)
wine.quality.value_counts()
5 681 6 638 7 199 4 53 8 18 3 10 Name: quality, dtype: int64
#To check missing values
sns.heatmap(wine.isnull())
<AxesSubplot:>
The dataset has no missing values. If there were any, they would show up in the figure as cells in a different colour shade.
dfcor=wine.corr()
dfcor
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.000000 | -0.256131 | 0.671703 | 0.114777 | 0.093705 | -0.153794 | -0.113181 | 0.668047 | -0.682978 | 0.183006 | -0.061668 | 0.124052 |
| volatile acidity | -0.256131 | 1.000000 | -0.552496 | 0.001918 | 0.061298 | -0.010504 | 0.076470 | 0.022026 | 0.234937 | -0.260987 | -0.202288 | -0.390558 |
| citric acid | 0.671703 | -0.552496 | 1.000000 | 0.143577 | 0.203823 | -0.060978 | 0.035533 | 0.364947 | -0.541904 | 0.312770 | 0.109903 | 0.226373 |
| residual sugar | 0.114777 | 0.001918 | 0.143577 | 1.000000 | 0.055610 | 0.187049 | 0.203028 | 0.355283 | -0.085652 | 0.005527 | 0.042075 | 0.013732 |
| chlorides | 0.093705 | 0.061298 | 0.203823 | 0.055610 | 1.000000 | 0.005562 | 0.047400 | 0.200632 | -0.265026 | 0.371260 | -0.221141 | -0.128907 |
| free sulfur dioxide | -0.153794 | -0.010504 | -0.060978 | 0.187049 | 0.005562 | 1.000000 | 0.667666 | -0.021946 | 0.070377 | 0.051658 | -0.069408 | -0.050656 |
| total sulfur dioxide | -0.113181 | 0.076470 | 0.035533 | 0.203028 | 0.047400 | 0.667666 | 1.000000 | 0.071269 | -0.066495 | 0.042947 | -0.205654 | -0.185100 |
| density | 0.668047 | 0.022026 | 0.364947 | 0.355283 | 0.200632 | -0.021946 | 0.071269 | 1.000000 | -0.341699 | 0.148506 | -0.496180 | -0.174919 |
| pH | -0.682978 | 0.234937 | -0.541904 | -0.085652 | -0.265026 | 0.070377 | -0.066495 | -0.341699 | 1.000000 | -0.196648 | 0.205633 | -0.057731 |
| sulphates | 0.183006 | -0.260987 | 0.312770 | 0.005527 | 0.371260 | 0.051658 | 0.042947 | 0.148506 | -0.196648 | 1.000000 | 0.093595 | 0.251397 |
| alcohol | -0.061668 | -0.202288 | 0.109903 | 0.042075 | -0.221141 | -0.069408 | -0.205654 | -0.496180 | 0.205633 | 0.093595 | 1.000000 | 0.476166 |
| quality | 0.124052 | -0.390558 | 0.226373 | 0.013732 | -0.128907 | -0.050656 | -0.185100 | -0.174919 | -0.057731 | 0.251397 | 0.476166 | 1.000000 |
sns.heatmap(dfcor)
<AxesSubplot:>
plt.figure(figsize=(6,4))
sns.heatmap(dfcor,cmap='Blues',annot=True)
<AxesSubplot:>
plt.figure(figsize=(10,6))
sns.heatmap(dfcor,cmap='Blues',annot=True)
<AxesSubplot:>
wine.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
#univariable analysis
wine['fixed acidity'].plot.box()
<AxesSubplot:>
wine['density'].plot.box()
<AxesSubplot:>
wine['alcohol'].plot.box()
<AxesSubplot:>
wine['free sulfur dioxide'].plot.box()
<AxesSubplot:>
wine['total sulfur dioxide'].plot.box()
<AxesSubplot:>
wine.shape
(1599, 12)
wine.plot(kind='box',subplots=True,layout=(2,6),figsize=(10,10))
fixed acidity AxesSubplot(0.125,0.53;0.110714x0.35) volatile acidity AxesSubplot(0.257857,0.53;0.110714x0.35) citric acid AxesSubplot(0.390714,0.53;0.110714x0.35) residual sugar AxesSubplot(0.523571,0.53;0.110714x0.35) chlorides AxesSubplot(0.656429,0.53;0.110714x0.35) free sulfur dioxide AxesSubplot(0.789286,0.53;0.110714x0.35) total sulfur dioxide AxesSubplot(0.125,0.11;0.110714x0.35) density AxesSubplot(0.257857,0.11;0.110714x0.35) pH AxesSubplot(0.390714,0.11;0.110714x0.35) sulphates AxesSubplot(0.523571,0.11;0.110714x0.35) alcohol AxesSubplot(0.656429,0.11;0.110714x0.35) quality AxesSubplot(0.789286,0.11;0.110714x0.35) dtype: object
sns.distplot(wine['density'])
<AxesSubplot:xlabel='density', ylabel='Density'>
sns.distplot(wine['citric acid'])
<AxesSubplot:xlabel='citric acid', ylabel='Density'>
wine.plot(kind='kde',subplots=True,layout=(2,6),figsize=(15,6))
array([[<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>,
<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>,
<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>],
[<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>,
<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>,
<AxesSubplot:ylabel='Density'>, <AxesSubplot:ylabel='Density'>]],
dtype=object)
#bivariate analysis
plt.scatter(wine['pH'],wine['quality'])
<matplotlib.collections.PathCollection at 0x1c2459fcd30>
sns.pairplot(wine)
<seaborn.axisgrid.PairGrid at 0x1c24e137670>
# remove the missing values
# drop the negatively correlated columns
#remove the outliers
# Data cleaning is done
plt.scatter(wine['volatile acidity'],wine['quality'])
plt.show()
wine.drop('volatile acidity',axis=1,inplace=True)
wine.head()
| fixed acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
wine.shape
(1599, 11)
from scipy.stats import zscore
z=np.abs(zscore(wine))
z
| fixed acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.528360 | 1.391472 | 0.453218 | 0.243707 | 0.466193 | 0.379133 | 0.558274 | 1.288643 | 0.579207 | 0.960246 | 0.787823 |
| 1 | 0.298547 | 1.391472 | 0.043416 | 0.223875 | 0.872638 | 0.624363 | 0.028261 | 0.719933 | 0.128950 | 0.584777 | 0.787823 |
| 2 | 0.298547 | 1.186070 | 0.169427 | 0.096353 | 0.083669 | 0.229047 | 0.134264 | 0.331177 | 0.048089 | 0.584777 | 0.787823 |
| 3 | 1.654856 | 1.484154 | 0.453218 | 0.264960 | 0.107592 | 0.411500 | 0.664277 | 0.979104 | 0.461180 | 0.584777 | 0.450848 |
| 4 | 0.528360 | 1.391472 | 0.453218 | 0.243707 | 0.466193 | 0.379133 | 0.558274 | 1.288643 | 0.579207 | 0.960246 | 0.787823 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | 1.217796 | 0.980669 | 0.382271 | 0.053845 | 1.542054 | 0.075043 | 0.978765 | 0.899886 | 0.461180 | 0.072294 | 0.787823 |
| 1595 | 1.390155 | 0.877968 | 0.240375 | 0.541259 | 2.211469 | 0.137820 | 0.862162 | 1.353436 | 0.601055 | 0.729364 | 0.450848 |
| 1596 | 1.160343 | 0.723916 | 0.169427 | 0.243707 | 1.255161 | 0.196679 | 0.533554 | 0.705508 | 0.542042 | 0.541630 | 0.450848 |
| 1597 | 1.390155 | 0.775267 | 0.382271 | 0.264960 | 1.542054 | 0.075043 | 0.676657 | 1.677400 | 0.305990 | 0.209308 | 0.787823 |
| 1598 | 1.332702 | 1.021999 | 0.752894 | 0.434990 | 0.203223 | 0.135861 | 0.666057 | 0.511130 | 0.010924 | 0.541630 | 0.450848 |
1599 rows × 11 columns
# An absolute z-score above this many standard deviations marks a value as an outlier.
threshold=3
# np.where returns two arrays: the row positions and the column positions
# of every cell of z that exceeds the threshold.
print(np.where(z>threshold))
(array([ 13, 14, 15, 15, 17, 17, 19, 33, 42, 43, 45,
57, 81, 81, 83, 86, 88, 91, 92, 95, 106, 106,
109, 142, 144, 147, 151, 151, 151, 151, 163, 164, 169,
169, 181, 226, 226, 240, 243, 244, 258, 258, 274, 281,
291, 324, 325, 339, 340, 347, 354, 374, 381, 391, 396,
396, 400, 400, 442, 442, 451, 459, 467, 480, 480, 494,
515, 517, 544, 554, 554, 555, 555, 557, 557, 568, 584,
588, 591, 595, 608, 614, 636, 639, 649, 649, 651, 652,
652, 652, 672, 684, 690, 692, 692, 695, 723, 730, 754,
776, 777, 795, 821, 832, 836, 837, 889, 899, 911, 917,
923, 925, 926, 982, 1017, 1018, 1043, 1051, 1051, 1071, 1074,
1079, 1079, 1081, 1081, 1111, 1114, 1131, 1154, 1165, 1175, 1186,
1231, 1235, 1244, 1244, 1244, 1260, 1269, 1269, 1270, 1270, 1288,
1289, 1295, 1296, 1299, 1300, 1316, 1319, 1319, 1321, 1358, 1367,
1370, 1370, 1372, 1372, 1374, 1374, 1434, 1434, 1434, 1435, 1435,
1435, 1469, 1474, 1474, 1474, 1476, 1476, 1476, 1478, 1493, 1496,
1505, 1558, 1558, 1570, 1574, 1589], dtype=int64), array([ 8, 4, 4, 5, 3, 8, 3, 2, 3, 8, 7, 4, 3, 8, 3, 8, 8,
8, 8, 7, 3, 8, 5, 9, 9, 3, 1, 3, 7, 8, 2, 2, 3, 8,
3, 3, 8, 3, 0, 0, 3, 8, 2, 3, 3, 2, 2, 8, 8, 0, 5,
0, 0, 0, 2, 4, 2, 4, 0, 6, 3, 10, 9, 2, 6, 2, 5, 10,
0, 0, 6, 0, 6, 0, 6, 3, 4, 9, 5, 2, 6, 8, 5, 8, 2,
5, 5, 0, 2, 9, 5, 5, 10, 3, 8, 7, 8, 3, 3, 3, 3, 8,
9, 10, 6, 6, 6, 10, 2, 2, 2, 4, 4, 4, 6, 6, 2, 3, 8,
2, 2, 2, 5, 2, 5, 7, 6, 4, 4, 3, 4, 2, 4, 2, 2, 4,
5, 3, 6, 9, 6, 9, 8, 8, 4, 4, 10, 7, 7, 3, 8, 7, 4,
8, 3, 8, 3, 8, 3, 10, 2, 4, 6, 2, 4, 6, 10, 2, 4, 6,
2, 4, 6, 10, 5, 5, 10, 3, 4, 3, 2, 2], dtype=int64))
#row and columns
z.iloc[13,8]
5.32210116796055
z.iloc[15,5]
3.087489568732511
wine_new=wine[(z<3).all(axis=1)]
wine_new
| fixed acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | 6.2 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
| 1595 | 5.9 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
| 1596 | 6.3 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
| 1597 | 5.9 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
| 1598 | 6.0 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 | 6 |
1458 rows × 11 columns
wine.shape
(1599, 11)
wine_new.shape
(1458, 11)
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('brain_size.csv',sep=';',na_values=".")
df
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118.0 | 64.5 | 816932 |
| 1 | 2 | Male | 140 | 150 | 124 | NaN | 72.5 | 1001121 |
| 2 | 3 | Male | 139 | 123 | 150 | 143.0 | 73.3 | 1038437 |
| 3 | 4 | Male | 133 | 129 | 128 | 172.0 | 68.8 | 965353 |
| 4 | 5 | Female | 137 | 132 | 134 | 147.0 | 65.0 | 951545 |
| 5 | 6 | Female | 99 | 90 | 110 | 146.0 | 69.0 | 928799 |
| 6 | 7 | Female | 138 | 136 | 131 | 138.0 | 64.5 | 991305 |
| 7 | 8 | Female | 92 | 90 | 98 | 175.0 | 66.0 | 854258 |
| 8 | 9 | Male | 89 | 93 | 84 | 134.0 | 66.3 | 904858 |
| 9 | 10 | Male | 133 | 114 | 147 | 172.0 | 68.8 | 955466 |
| 10 | 11 | Female | 132 | 129 | 124 | 118.0 | 64.5 | 833868 |
| 11 | 12 | Male | 141 | 150 | 128 | 151.0 | 70.0 | 1079549 |
| 12 | 13 | Male | 135 | 129 | 124 | 155.0 | 69.0 | 924059 |
| 13 | 14 | Female | 140 | 120 | 147 | 155.0 | 70.5 | 856472 |
| 14 | 15 | Female | 96 | 100 | 90 | 146.0 | 66.0 | 878897 |
| 15 | 16 | Female | 83 | 71 | 96 | 135.0 | 68.0 | 865363 |
| 16 | 17 | Female | 132 | 132 | 120 | 127.0 | 68.5 | 852244 |
| 17 | 18 | Male | 100 | 96 | 102 | 178.0 | 73.5 | 945088 |
| 18 | 19 | Female | 101 | 112 | 84 | 136.0 | 66.3 | 808020 |
| 19 | 20 | Male | 80 | 77 | 86 | 180.0 | 70.0 | 889083 |
| 20 | 21 | Male | 83 | 83 | 86 | NaN | NaN | 892420 |
| 21 | 22 | Male | 97 | 107 | 84 | 186.0 | 76.5 | 905940 |
| 22 | 23 | Female | 135 | 129 | 134 | 122.0 | 62.0 | 790619 |
| 23 | 24 | Male | 139 | 145 | 128 | 132.0 | 68.0 | 955003 |
| 24 | 25 | Female | 91 | 86 | 102 | 114.0 | 63.0 | 831772 |
| 25 | 26 | Male | 141 | 145 | 131 | 171.0 | 72.0 | 935494 |
| 26 | 27 | Female | 85 | 90 | 84 | 140.0 | 68.0 | 798612 |
| 27 | 28 | Male | 103 | 96 | 110 | 187.0 | 77.0 | 1062462 |
| 28 | 29 | Female | 77 | 83 | 72 | 106.0 | 63.0 | 793549 |
| 29 | 30 | Female | 130 | 126 | 124 | 159.0 | 66.5 | 866662 |
| 30 | 31 | Female | 133 | 126 | 132 | 127.0 | 62.5 | 857782 |
| 31 | 32 | Male | 144 | 145 | 137 | 191.0 | 67.0 | 949589 |
| 32 | 33 | Male | 103 | 96 | 110 | 192.0 | 75.5 | 997925 |
| 33 | 34 | Male | 90 | 96 | 86 | 181.0 | 69.0 | 879987 |
| 34 | 35 | Female | 83 | 90 | 81 | 143.0 | 66.5 | 834344 |
| 35 | 36 | Female | 133 | 129 | 128 | 153.0 | 66.5 | 948066 |
| 36 | 37 | Male | 140 | 150 | 124 | 144.0 | 70.5 | 949395 |
| 37 | 38 | Female | 88 | 86 | 94 | 139.0 | 64.5 | 893983 |
| 38 | 39 | Male | 81 | 90 | 74 | 148.0 | 74.0 | 930016 |
| 39 | 40 | Male | 89 | 91 | 89 | 179.0 | 75.5 | 935863 |
df.head()
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118.0 | 64.5 | 816932 |
| 1 | 2 | Male | 140 | 150 | 124 | NaN | 72.5 | 1001121 |
| 2 | 3 | Male | 139 | 123 | 150 | 143.0 | 73.3 | 1038437 |
| 3 | 4 | Male | 133 | 129 | 128 | 172.0 | 68.8 | 965353 |
| 4 | 5 | Female | 137 | 132 | 134 | 147.0 | 65.0 | 951545 |
df.head(10)
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118.0 | 64.5 | 816932 |
| 1 | 2 | Male | 140 | 150 | 124 | NaN | 72.5 | 1001121 |
| 2 | 3 | Male | 139 | 123 | 150 | 143.0 | 73.3 | 1038437 |
| 3 | 4 | Male | 133 | 129 | 128 | 172.0 | 68.8 | 965353 |
| 4 | 5 | Female | 137 | 132 | 134 | 147.0 | 65.0 | 951545 |
| 5 | 6 | Female | 99 | 90 | 110 | 146.0 | 69.0 | 928799 |
| 6 | 7 | Female | 138 | 136 | 131 | 138.0 | 64.5 | 991305 |
| 7 | 8 | Female | 92 | 90 | 98 | 175.0 | 66.0 | 854258 |
| 8 | 9 | Male | 89 | 93 | 84 | 134.0 | 66.3 | 904858 |
| 9 | 10 | Male | 133 | 114 | 147 | 172.0 | 68.8 | 955466 |
df.tail()
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 35 | 36 | Female | 133 | 129 | 128 | 153.0 | 66.5 | 948066 |
| 36 | 37 | Male | 140 | 150 | 124 | 144.0 | 70.5 | 949395 |
| 37 | 38 | Female | 88 | 86 | 94 | 139.0 | 64.5 | 893983 |
| 38 | 39 | Male | 81 | 90 | 74 | 148.0 | 74.0 | 930016 |
| 39 | 40 | Male | 89 | 91 | 89 | 179.0 | 75.5 | 935863 |
df.sample()
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 133 | 132 | 124 | 118.0 | 64.5 | 816932 |
df.shape
(40, 8)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40 entries, 0 to 39 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 40 non-null int64 1 Gender 40 non-null object 2 FSIQ 40 non-null int64 3 VIQ 40 non-null int64 4 PIQ 40 non-null int64 5 Weight 38 non-null float64 6 Height 39 non-null float64 7 MRI_Count 40 non-null int64 dtypes: float64(2), int64(5), object(1) memory usage: 2.6+ KB
df.dtypes
id int64 Gender object FSIQ int64 VIQ int64 PIQ int64 Weight float64 Height float64 MRI_Count int64 dtype: object
df.isnull()
| id | Gender | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | True | False | False |
| 2 | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False |
| 5 | False | False | False | False | False | False | False | False |
| 6 | False | False | False | False | False | False | False | False |
| 7 | False | False | False | False | False | False | False | False |
| 8 | False | False | False | False | False | False | False | False |
| 9 | False | False | False | False | False | False | False | False |
| 10 | False | False | False | False | False | False | False | False |
| 11 | False | False | False | False | False | False | False | False |
| 12 | False | False | False | False | False | False | False | False |
| 13 | False | False | False | False | False | False | False | False |
| 14 | False | False | False | False | False | False | False | False |
| 15 | False | False | False | False | False | False | False | False |
| 16 | False | False | False | False | False | False | False | False |
| 17 | False | False | False | False | False | False | False | False |
| 18 | False | False | False | False | False | False | False | False |
| 19 | False | False | False | False | False | False | False | False |
| 20 | False | False | False | False | False | True | True | False |
| 21 | False | False | False | False | False | False | False | False |
| 22 | False | False | False | False | False | False | False | False |
| 23 | False | False | False | False | False | False | False | False |
| 24 | False | False | False | False | False | False | False | False |
| 25 | False | False | False | False | False | False | False | False |
| 26 | False | False | False | False | False | False | False | False |
| 27 | False | False | False | False | False | False | False | False |
| 28 | False | False | False | False | False | False | False | False |
| 29 | False | False | False | False | False | False | False | False |
| 30 | False | False | False | False | False | False | False | False |
| 31 | False | False | False | False | False | False | False | False |
| 32 | False | False | False | False | False | False | False | False |
| 33 | False | False | False | False | False | False | False | False |
| 34 | False | False | False | False | False | False | False | False |
| 35 | False | False | False | False | False | False | False | False |
| 36 | False | False | False | False | False | False | False | False |
| 37 | False | False | False | False | False | False | False | False |
| 38 | False | False | False | False | False | False | False | False |
| 39 | False | False | False | False | False | False | False | False |
df.isnull().sum()
id 0 Gender 0 FSIQ 0 VIQ 0 PIQ 0 Weight 2 Height 1 MRI_Count 0 dtype: int64
sns.heatmap(df.isnull())
<AxesSubplot:>
df.columns
Index(['id', 'Gender', 'FSIQ', 'VIQ', 'PIQ', 'Weight', 'Height', 'MRI_Count'], dtype='object')
df.describe()
| id | FSIQ | VIQ | PIQ | Weight | Height | MRI_Count | |
|---|---|---|---|---|---|---|---|
| count | 40.000000 | 40.000000 | 40.000000 | 40.00000 | 38.000000 | 39.000000 | 4.000000e+01 |
| mean | 20.500000 | 113.450000 | 112.350000 | 111.02500 | 151.052632 | 68.525641 | 9.087550e+05 |
| std | 11.690452 | 24.082071 | 23.616107 | 22.47105 | 23.478509 | 3.994649 | 7.228205e+04 |
| min | 1.000000 | 77.000000 | 71.000000 | 72.00000 | 106.000000 | 62.000000 | 7.906190e+05 |
| 25% | 10.750000 | 89.750000 | 90.000000 | 88.25000 | 135.250000 | 66.000000 | 8.559185e+05 |
| 50% | 20.500000 | 116.500000 | 113.000000 | 115.00000 | 146.500000 | 68.000000 | 9.053990e+05 |
| 75% | 30.250000 | 135.500000 | 129.750000 | 128.00000 | 172.000000 | 70.500000 | 9.500780e+05 |
| max | 40.000000 | 144.000000 | 150.000000 | 150.00000 | 192.000000 | 77.000000 | 1.079549e+06 |
sns.countplot(x='Gender',data=df)
<AxesSubplot:xlabel='Gender', ylabel='count'>
sns.displot(df['Height'],bins=10)
<seaborn.axisgrid.FacetGrid at 0x1c2459780a0>
sns.displot(df['Weight'],kde=False,bins=20)
<seaborn.axisgrid.FacetGrid at 0x1c25b85a070>
df['Weight'].plot.box()
<AxesSubplot:>
df.columns
Index(['id', 'Gender', 'FSIQ', 'VIQ', 'PIQ', 'Weight', 'Height', 'MRI_Count'], dtype='object')
sns.boxplot(y='FSIQ',data=df)
<AxesSubplot:ylabel='FSIQ'>
sns.boxplot(y='VIQ',data=df)
<AxesSubplot:ylabel='VIQ'>
df['PIQ'].plot.box()
<AxesSubplot:>
df.isnull().sum()
id 0 Gender 0 FSIQ 0 VIQ 0 PIQ 0 Weight 2 Height 1 MRI_Count 0 dtype: int64
import numpy as np
# Fill the missing weights with the column mean.
# mean() must be *called*: passing the bound method df['Weight'].mean would
# store the method object itself in the NaN cells and turn the column into
# object dtype. Assigning the result back avoids chained inplace modification.
df['Weight']=df['Weight'].fillna(df['Weight'].mean())
df.isnull().sum()
id 0 Gender 0 FSIQ 0 VIQ 0 PIQ 0 Weight 0 Height 1 MRI_Count 0 dtype: int64
# Drop the rows that still contain missing values
df.dropna(inplace=True)
df.isnull().sum()
id 0 Gender 0 FSIQ 0 VIQ 0 PIQ 0 Weight 0 Height 0 MRI_Count 0 dtype: int64
sns.heatmap(df.isnull())
<AxesSubplot:>
# Correlate the numeric columns only: df also holds the string column Gender,
# which DataFrame.corr() cannot convert in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr())
<AxesSubplot:>
# Same heatmap with the coefficient written in each cell; numeric columns only,
# because df also holds the string column Gender.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)
<AxesSubplot:>